"""Excel -> Word / text converters for the data-resource requirement workbooks.

Provenance: recovered from commit 7ea977268a8f09d3af048f7d9de9fdeea9278aac
("init", author liuqingwen, Thu Jul 25 15:09:42 2024 +0800).  That commit added
four stand-alone scripts (excel2word.py, excel2word2.py, execl.py,
wordformat.py) and the binary Word template ``temp.docx``.  Each script is kept
here as a function of the same name; run one with
``python <this file> <tool name>``.

NOTE(review): the recovered text had lost its line breaks and indentation, so
the nesting below was reconstructed from the control flow (``continue``
placement and flag usage) -- confirm against the repository before relying on
edge-case behaviour.

Expected sheet layout, as implied by the column checks in the code (TODO
confirm against a real workbook): column B carries the table title / row
marker, column D an optional sub-table name, column E onwards the item name
and its attributes.  A row whose column E is empty starts a new table.
"""

import glob
import os
import sys

import pandas as pd

# python-docx is imported lazily inside the functions that build Word files,
# so the text-only export (``execl``) works without python-docx installed.


def output_stem(path):
    """Return the output file name (without extension) derived from *path*.

    Input files are named like ``附件2:<title>_<id>.xlsx``; the result is the
    ``<title>`` part.  A path without the full-width colon falls back to its
    base name instead of raising ``IndexError``, and a leftover Excel
    extension is dropped in that fallback case.
    """
    parts = path.split(':')
    tail = parts[1] if len(parts) > 1 else os.path.basename(path)
    stem = tail.split('_')[0]
    root, ext = os.path.splitext(stem)
    return root if ext.lower() in ('.xlsx', '.xls') else stem


def strip_ordinal(title):
    """Drop a leading ``<Chinese numeral>、`` prefix from a table title.

    Titles without the ``、`` separator (or non-string cells) are returned
    unchanged as text rather than raising.
    """
    text = str(title)
    _, sep, rest = text.partition('、')
    return rest if sep else text


def cell_text(value):
    """Render one cell for the Word document.

    Missing cells become ``无``; line breaks inside a cell are removed.  The
    missing check is done on the value itself so that ordinary text that merely
    contains the letters "nan" is left intact.
    """
    if pd.isnull(value):
        return '无'
    return str(value).replace('\n', '')


def sheet_outline(df, sheet_name, column_black_list=(), dedupe_headings=False):
    """Yield the Word outline for one sheet as ``(level, text)`` pairs.

    ``level`` is the heading level, or ``None`` for a body paragraph.

    Args:
        df: the sheet as parsed by ``ExcelFile.parse`` (needs >= 5 columns).
        sheet_name: name of the sheet; emitted as the top heading.
        column_black_list: header names whose values are not emitted.
        dedupe_headings: when true (excel2word.py behaviour) a table or
            sub-table whose name repeats the sheet/table name gets no heading
            of its own and everything beneath it moves up one level.
    """
    # Heading levels are shifted down by this offset; it is reset to 1 at the
    # start of every table and only ever lowered by the dedupe rule.
    level_offset = 1
    have_name = False
    have_header = False
    headline = None  # header cells of the FIRST table in this sheet
    table_name = None

    yield 1 + level_offset, sheet_name

    for _, row in df.iterrows():
        col_b, col_d, col_e = row.iloc[1], row.iloc[3], row.iloc[4]

        # An empty column E marks the start of a new table.
        if pd.isnull(col_e):
            level_offset = 1
            have_name = False
            have_header = False
        # An empty column B means the whole row is blank.
        if pd.isnull(col_b):
            continue

        # First row of a table: column B holds the table title.
        if not have_name:
            table_name = strip_ordinal(col_b)
            have_name = True
            if dedupe_headings and table_name == sheet_name:
                level_offset -= 1
            else:
                yield 2 + level_offset, table_name
            continue

        # Second row of a table: the header row.  Only the first table's
        # header is captured; later tables in the sheet reuse it.
        if not have_header:
            if headline is None:
                headline = list(row.iloc[4:])
            have_header = True
            continue

        # Item rows: a non-empty column D introduces a sub-table.
        if not pd.isnull(col_d):
            sub_name = str(col_d).replace(' ', '')
            if dedupe_headings and sub_name in (sheet_name, table_name):
                level_offset -= 1
            else:
                yield 3 + level_offset, sub_name

        # Column E cannot be empty here: an empty E resets the flags above and
        # the row is then consumed as a title row.
        yield 4 + level_offset, str(col_e)
        for head, cell in zip(headline, row.iloc[4:]):
            if head in column_black_list:
                continue
            yield None, f"{head}:{cell_text(cell)}"


def sheet_text_lines(df, sheet_name):
    """Yield the plain-text lines (newline-terminated) for one sheet.

    Mirrors execl.py: sheet name, then per table its title and one
    ``header:value`` line per cell, with blank lines between items/tables.
    The header list is local to the sheet, so headers from a previously
    processed sheet can no longer be paired with this sheet's cells.
    """
    yield sheet_name + "\n"
    have_name = False
    have_header = False
    headline = None  # header cells of the first table in this sheet

    for _, row in df.iterrows():
        col_b = row.iloc[1]
        # Empty column B: separator row between tables.
        if pd.isnull(col_b):
            yield "\n\n"
            have_name = False
            have_header = False
            continue
        if not have_name:
            yield str(col_b) + "\n"
            have_name = True
            continue

        cells = [col_b, *row.iloc[4:]]
        if not have_header:
            if headline is None:
                headline = cells
            have_header = True
            continue

        for head, cell in zip(headline, cells):
            yield f"{head}:" + str(cell).replace('\n', '') + "\n"
        yield "\n"


def renumber_paragraphs(texts):
    """Yield ``(level, text)`` pairs that number databases and tables.

    Paragraphs starting with ``数据库:`` become level-1 headings numbered
    ``N.``; paragraphs starting with ``表名:`` become level-2 headings numbered
    ``N.M.``.  A ``表名:`` paragraph with no name marks an empty table: it and
    the paragraphs that follow are dropped until the next table or database
    heading.  Everything else is passed through as a body paragraph
    (``level`` is ``None``).
    """
    database_index = 0
    table_index = 1
    skipping = False

    for text in texts:
        if text.startswith('数据库:'):
            database_index += 1
            table_index = 1
            # A new database ends any empty-table section; without this the
            # text right after the database heading would be dropped.
            skipping = False
            yield 1, f"{database_index}.{text}"
        elif text.startswith('表名:'):
            # Whitespace after the prefix still counts as "no table name".
            skipping = not text[len('表名:'):].strip()
            if skipping:
                continue
            yield 2, f"{database_index}.{table_index}.{text}"
            table_index += 1
        elif not skipping:
            yield None, text


def _write_outline(doc, outline):
    """Append ``(level, text)`` pairs to a python-docx document."""
    for level, text in outline:
        if level is None:
            doc.add_paragraph(text)
        else:
            doc.add_heading(text, level)


def _convert_workbooks(xlsx_files, sheet_name_black_list, column_black_list,
                       dedupe_headings):
    """Write one ``<title>.docx`` per workbook, starting from ``temp.docx``."""
    from docx import Document  # third-party: python-docx

    for path in xlsx_files:
        # Start every output from the Word template.
        doc = Document('temp.docx')
        print(f"正在处理文件: {path}")

        # The context manager closes the workbook even if a sheet fails.
        with pd.ExcelFile(path) as xls:
            for sheet_name in xls.sheet_names:
                if sheet_name in sheet_name_black_list:
                    continue
                print(f"正在处理工作表: {sheet_name}")
                df = xls.parse(sheet_name)
                _write_outline(
                    doc,
                    sheet_outline(df, sheet_name, column_black_list,
                                  dedupe_headings),
                )
        doc.save(output_stem(path) + ".docx")
    print("处理完成")


def excel2word(
    xlsx_files=('附件2:自治区文化旅游主题共享资源库需求对接表_3932232206984868.xlsx',),
    # Sheet blacklist.  '退役军人' ships with the Excel template and carries no
    # data in the current workbooks; handle it specially if that changes.
    sheet_name_black_list=('填表说明', '数据资源目录结构', '新增需求填报', '退役军人'),
    column_black_list=('期望更新周期',),
):
    """Convert the given workbooks to Word, collapsing repeated headings."""
    _convert_workbooks(xlsx_files, sheet_name_black_list, column_black_list,
                       dedupe_headings=True)


def excel2word2(
    xlsx_files=None,
    sheet_name_black_list=('指标体系', '资源目录'),
    column_black_list=('期望更新周期',),
):
    """Convert every ``./附件2*.xlsx`` workbook to Word with fixed heading levels."""
    if xlsx_files is None:
        xlsx_files = glob.glob('./附件2*.xlsx')
    _convert_workbooks(xlsx_files, sheet_name_black_list, column_black_list,
                       dedupe_headings=False)


def execl(
    xlsx_files=None,
    sheet_name_black_list=('填表说明', '数据资源目录结构', '新增需求填报'),
):
    """Dump every ``./附件2*.xlsx`` workbook to a UTF-8 ``<title>.txt`` file."""
    if xlsx_files is None:
        xlsx_files = glob.glob('./附件2*.xlsx')

    for path in xlsx_files:
        print(f"正在处理文件: {path}")
        out_path = output_stem(path) + '.txt'
        with pd.ExcelFile(path) as xls, \
                open(out_path, 'w', encoding='utf-8') as out:
            for sheet_name in xls.sheet_names:
                if sheet_name in sheet_name_black_list:
                    continue
                print(f"正在处理工作表: {sheet_name}")
                df = xls.parse(sheet_name)
                out.writelines(sheet_text_lines(df, sheet_name))
    print("处理完成")


def wordformat(
    source='数据库概要设计说明书.docx',
    template='temp.docx',
    target='modified_document.docx',
):
    """Copy *source* into *template* with numbered database/table headings."""
    from docx import Document  # third-party: python-docx

    document = Document(source)
    doc = Document(template)
    _write_outline(
        doc,
        renumber_paragraphs(paragraph.text for paragraph in document.paragraphs),
    )
    doc.save(target)


_TOOLS = {
    'excel2word': excel2word,
    'excel2word2': excel2word2,
    'execl': execl,
    'wordformat': wordformat,
}


if __name__ == "__main__":
    _tool = sys.argv[1] if len(sys.argv) > 1 else ''
    if _tool not in _TOOLS:
        raise SystemExit("usage: python <script> {" + "|".join(_TOOLS) + "}")
    _TOOLS[_tool]()