You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
3.1 KiB
80 lines
3.1 KiB
import glob
import os

import pandas as pd
from docx import Document

# 目录级别后移,最小为0
|
|
level_offset = 1
|
|
name = False
|
|
header = False
|
|
headline = []
|
|
# sheet页黑名单,当前所有表格中均不含退役军人项,该项为excel模版中带有的sheet页,如有例外需特殊处理。
|
|
sheet_name_black_list = ['指标体系', '资源目录']
|
|
column_black_list = ['期望更新周期']
|
|
|
|
# 获取全部excel表格
|
|
xlsx_files = glob.glob('./附件2*.xlsx')
|
|
|
|
# sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报']
|
|
# xlsx_files = ['附件2:自治区文化旅游主题共享资源库需求对接表_3932232206984868.xlsx']
|
|
|
|
for file in xlsx_files:
|
|
# 打开模板word文件
|
|
doc = Document('temp.docx')
|
|
print(f"正在处理文件: {file}")
|
|
|
|
# 读取Excel文件
|
|
xls = pd.ExcelFile(file)
|
|
# 逐sheet处理
|
|
for sheet_name in xls.sheet_names:
|
|
name = False
|
|
header = False
|
|
if sheet_name in sheet_name_black_list:
|
|
continue
|
|
print(f"正在处理工作表: {sheet_name}")
|
|
# 读取当前工作表到DataFrame
|
|
df = xls.parse(sheet_name)
|
|
doc.add_heading(sheet_name, 1 + level_offset)
|
|
initialized_header = False
|
|
|
|
third_index = 1
|
|
items_index = 1
|
|
for index, row in df.iterrows():
|
|
# 检查当前行的E列是否为空,为空则说明为新表,重置表名及表头标记
|
|
if pd.isnull(row[4]):
|
|
name = False
|
|
header = False
|
|
# 检查当前行的B列是否为空,为空说明整行为空要跳过
|
|
if pd.isnull(row[1]):
|
|
continue
|
|
# 只要符合模式,新表第一行B列一定为表名
|
|
if not name:
|
|
# 去除表名开头的“中文数字、”
|
|
doc.add_heading(str.split(row[1], '、', 1)[1], 2 + level_offset)
|
|
name = True
|
|
continue
|
|
# 只要符合模式,新表第二行一定为表头
|
|
if not header:
|
|
if not initialized_header:
|
|
initialized_header = True
|
|
headline.clear()
|
|
new_row = row[4:]
|
|
for cell in new_row:
|
|
headline.append(cell)
|
|
header = True
|
|
continue
|
|
# 只要符合模式,新表从第三行开始,非空的D列一定为子表名
|
|
if not pd.isnull(row[3]):
|
|
doc.add_heading(str(row[3]).replace(' ', ''), 3 + level_offset)
|
|
items_index = 1
|
|
# 默认正常的E列(非表中间间隔)不为空,为空会出错
|
|
doc.add_heading(row[4], 4 + level_offset)
|
|
for head, cell in zip(headline, row[4:]):
|
|
if head in column_black_list:
|
|
continue
|
|
doc.add_paragraph(head + ":" + str(cell).replace('\n', '').replace('nan', '无'))
|
|
items_index += 1
|
|
# 文件另存为
|
|
out_file_name = file.split(':')[1].split('_')[0]
|
|
doc.save(out_file_name + ".docx")
|
|
xls.close()
|
|
print("处理完成")
|
|
|