commit
7ea977268a
5 changed files with 254 additions and 0 deletions
@ -0,0 +1,90 @@ |
|||||
|
import glob |
||||
|
import pandas as pd |
||||
|
from docx import Document |
||||
|
|
||||
|
# Convert each workbook in ``xlsx_files`` into a Word document built on the
# ``temp.docx`` template: one heading per sheet, table, sub-table and item,
# followed by one "header:value" paragraph per data cell.
#
# Sheet layout the loop relies on (columns are addressed by position):
#   * a row whose column E is empty starts a new table;
#   * the first non-empty row of a table holds the table title in column B,
#     prefixed with a Chinese ordinal and '、';
#   * the next row is the header row (columns E onward);
#   * on later rows a non-empty column D names a sub-table and column E is
#     the item title.

# Offset added to every heading level. The original note says the intended
# minimum is 0; the code below does not enforce that bound.
level_offset = 1
name = False
header = False
headline = []
# Sheets to skip. According to the original note, no current workbook holds a
# real '退役军人' entry: that sheet ships with the Excel template, and any
# exception needs special handling.
sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报', '退役军人']
# Header names whose values are left out of the generated document.
column_black_list = ['期望更新周期']

# Workbooks to convert. To process every matching workbook instead, use
# glob.glob('./附件2*.xlsx').
xlsx_files = ['附件2:自治区文化旅游主题共享资源库需求对接表_3932232206984868.xlsx']


def strip_ordinal_prefix(title):
    """Return *title* without its leading "<ordinal>、" prefix.

    A title that has no '、' separator is returned whole instead of raising.
    """
    return str(title).split('、', 1)[-1]


def cell_text(cell):
    """Return the paragraph text for one cell.

    Missing values become '无'; anything else is stringified with newlines
    removed. Text that merely contains the letters "nan" is left intact.
    """
    if pd.isnull(cell):
        return '无'
    return str(cell).replace('\n', '')


for file in xlsx_files:
    # Every output document starts from the Word template.
    doc = Document('temp.docx')
    print(f"正在处理文件: {file}")

    # The context manager closes the workbook even when a sheet fails.
    with pd.ExcelFile(file) as xls:
        for sheet_name in xls.sheet_names:
            level_offset = 1
            name = False
            header = False
            if sheet_name in sheet_name_black_list:
                continue
            print(f"正在处理工作表: {sheet_name}")
            df = xls.parse(sheet_name)
            doc.add_heading(sheet_name, 1 + level_offset)
            # The header row is captured once per sheet and reused by every
            # table in that sheet.
            initialized_header = False

            for _, row in df.iterrows():
                # .iloc keeps the access positional whatever the column labels are.
                col_b, col_d, col_e = row.iloc[1], row.iloc[3], row.iloc[4]

                # Empty column E: a new table starts, so reset the per-table state.
                if pd.isnull(col_e):
                    level_offset = 1
                    name = False
                    header = False
                # Empty column B: the whole row is blank.
                if pd.isnull(col_b):
                    continue

                # First row of a table: column B is the table title.
                if not name:
                    table_name = strip_ordinal_prefix(col_b)
                    name = True
                    if table_name == sheet_name:
                        # Title repeats the sheet name: skip the heading and
                        # lift the levels below it by one.
                        level_offset -= 1
                    else:
                        doc.add_heading(table_name, 2 + level_offset)
                    continue

                # Second row of a table: the header row.
                if not header:
                    if not initialized_header:
                        initialized_header = True
                        headline.clear()
                        headline.extend(row.iloc[4:])
                    header = True
                    continue

                # Data rows: a non-empty column D names a sub-table.
                if not pd.isnull(col_d):
                    sub_name = str(col_d).replace(' ', '')
                    if sub_name in (sheet_name, table_name):
                        level_offset -= 1
                    else:
                        doc.add_heading(sub_name, 3 + level_offset)

                # Column E is expected to be non-empty on a data row.
                doc.add_heading(col_e, 4 + level_offset)
                for head, cell in zip(headline, row.iloc[4:]):
                    if head in column_black_list:
                        continue
                    doc.add_paragraph(head + ":" + cell_text(cell))

    # Output name: the part of the file name between ':' and the first '_'.
    # Raises IndexError when the name contains no ':'.
    out_file_name = file.split(':')[1].split('_')[0]
    doc.save(out_file_name + ".docx")

print("处理完成")
@ -0,0 +1,80 @@ |
|||||
|
import glob |
||||
|
import pandas as pd |
||||
|
from docx import Document |
||||
|
|
||||
|
# Convert every workbook matching './附件2*.xlsx' into a Word document built
# on the ``temp.docx`` template: one heading per sheet, table, sub-table and
# item, followed by one "header:value" paragraph per data cell.
#
# Sheet layout the loop relies on (columns are addressed by position):
#   * a row whose column E is empty starts a new table;
#   * the first non-empty row of a table holds the table title in column B,
#     prefixed with a Chinese ordinal and '、';
#   * the next row is the header row (columns E onward);
#   * on later rows a non-empty column D names a sub-table and column E is
#     the item title.

# Offset added to every heading level; this script never changes it.
level_offset = 1
name = False
header = False
headline = []
# Sheets to skip.
sheet_name_black_list = ['指标体系', '资源目录']
# Header names whose values are left out of the generated document.
column_black_list = ['期望更新周期']

# Workbooks to convert. To process a single workbook, replace the glob with
# an explicit list of file names.
xlsx_files = glob.glob('./附件2*.xlsx')


def strip_ordinal_prefix(title):
    """Return *title* without its leading "<ordinal>、" prefix.

    A title that has no '、' separator is returned whole instead of raising.
    """
    return str(title).split('、', 1)[-1]


def cell_text(cell):
    """Return the paragraph text for one cell.

    Missing values become '无'; anything else is stringified with newlines
    removed. Text that merely contains the letters "nan" is left intact.
    """
    if pd.isnull(cell):
        return '无'
    return str(cell).replace('\n', '')


for file in xlsx_files:
    # Every output document starts from the Word template.
    doc = Document('temp.docx')
    print(f"正在处理文件: {file}")

    # The context manager closes the workbook even when a sheet fails.
    with pd.ExcelFile(file) as xls:
        for sheet_name in xls.sheet_names:
            name = False
            header = False
            if sheet_name in sheet_name_black_list:
                continue
            print(f"正在处理工作表: {sheet_name}")
            df = xls.parse(sheet_name)
            doc.add_heading(sheet_name, 1 + level_offset)
            # The header row is captured once per sheet and reused by every
            # table in that sheet.
            initialized_header = False

            for _, row in df.iterrows():
                # .iloc keeps the access positional whatever the column labels are.
                col_b, col_d, col_e = row.iloc[1], row.iloc[3], row.iloc[4]

                # Empty column E: a new table starts, so reset the per-table state.
                if pd.isnull(col_e):
                    name = False
                    header = False
                # Empty column B: the whole row is blank.
                if pd.isnull(col_b):
                    continue

                # First row of a table: column B is the table title.
                if not name:
                    doc.add_heading(strip_ordinal_prefix(col_b), 2 + level_offset)
                    name = True
                    continue

                # Second row of a table: the header row.
                if not header:
                    if not initialized_header:
                        initialized_header = True
                        headline.clear()
                        headline.extend(row.iloc[4:])
                    header = True
                    continue

                # Data rows: a non-empty column D names a sub-table.
                if not pd.isnull(col_d):
                    doc.add_heading(str(col_d).replace(' ', ''), 3 + level_offset)

                # Column E is expected to be non-empty on a data row.
                doc.add_heading(col_e, 4 + level_offset)
                for head, cell in zip(headline, row.iloc[4:]):
                    if head in column_black_list:
                        continue
                    doc.add_paragraph(head + ":" + cell_text(cell))

    # Output name: the part of the file name between ':' and the first '_'.
    # Raises IndexError when the name contains no ':'.
    out_file_name = file.split(':')[1].split('_')[0]
    doc.save(out_file_name + ".docx")

print("处理完成")
@ -0,0 +1,53 @@ |
|||||
|
import glob |
||||
|
import pandas as pd |
||||
|
|
||||
|
# Dump every workbook matching './附件2*.xlsx' to a plain-text file: the sheet
# name, each table title, then one "header:value" line per cell of every
# data row, with a blank line after each record.
#
# Per sheet, a row with an empty column B separates tables; the first row of
# a table is its title (column B) and the second is the header row.

name = False
header = False
headline = []
# Sheets to skip.
sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报']


def record_cells(row):
    """Return column B followed by columns E onward of *row*, by position."""
    return [row.iloc[1], *row.iloc[4:]]


xlsx_files = glob.glob('./附件2*.xlsx')
for file in xlsx_files:
    print(f"正在处理文件: {file}")
    # Output name: the part of the file name between ':' and the first '_'.
    # Raises IndexError when the name contains no ':'.
    out_file_name = file.split(':')[1].split('_')[0]

    # Both resources are released on exit, even if a sheet fails to parse.
    with pd.ExcelFile(file) as xls:
        with open(out_file_name + '.txt', 'w', encoding='utf-8') as out:
            for sheet_name in xls.sheet_names:
                name = False
                header = False
                if sheet_name in sheet_name_black_list:
                    continue
                print(f"正在处理工作表: {sheet_name}")
                df = xls.parse(sheet_name)
                out.write(sheet_name + "\n")
                # The header row is captured once per sheet and reused by
                # every table in that sheet.
                initialized_header = False

                for _, row in df.iterrows():
                    # Empty column B: table separator, reset per-table state.
                    if pd.isnull(row.iloc[1]):
                        out.write("\n\n")
                        name = False
                        header = False
                        continue

                    # First row of a table: its title.
                    if not name:
                        out.write(row.iloc[1] + "\n")
                        name = True
                        continue

                    # Second row of a table: the header row.
                    if not header:
                        if not initialized_header:
                            initialized_header = True
                            # Drop the previous sheet's header; otherwise zip()
                            # below keeps pairing values with stale names.
                            headline.clear()
                            headline.extend(record_cells(row))
                        header = True
                        continue

                    # Data row: one "header:value" line per cell.
                    for head, cell in zip(headline, record_cells(row)):
                        out.write(head + ":" + str(cell).replace('\n', '') + "\n")
                    out.write("\n")

print("处理完成")
Binary file not shown.
@ -0,0 +1,31 @@ |
|||||
|
from docx import Document |
||||
|
|
||||
|
# Rebuild '数据库概要设计说明书.docx' on top of the 'temp.docx' template:
# database paragraphs become numbered level-1 headings, table paragraphs
# become numbered level-2 headings, and everything else is copied as plain
# paragraphs. A table paragraph with no name after the prefix is dropped
# together with the paragraphs that follow it, up to the next heading.

# Document to read from.
document = Document('数据库概要设计说明书.docx')
# Template document that receives the regenerated content.
doc = Document('temp.docx')

database_index = 0
table_index = 1
empty_table = False

for paragraph in document.paragraphs:
    text = paragraph.text

    # Database marker: next top-level number, table numbering restarts.
    if text.startswith('数据库:'):
        database_index += 1
        doc.add_heading(f"{database_index}.{text}", 1)
        table_index = 1
        continue

    # Table marker: a length of 3 means only the prefix is present.
    if text.startswith('表名:'):
        empty_table = len(text) == 3
        if empty_table:
            continue
        doc.add_heading(f"{database_index}.{table_index}.{text}", 2)
        table_index += 1
        continue

    # Body text is copied unless it belongs to an unnamed table.
    if not empty_table:
        doc.add_paragraph(text)

# Save the rebuilt document.
doc.save('modified_document.docx')
Loading…
Reference in new issue