pythonScript/excel2word2.py


								import glob

								import pandas as pd

								from docx import Document


								# Shift applied to every heading level written to the document; minimum is 0.
								level_offset = 1
								# Per-table parser state: whether the title row / header row of the table
								# currently being read has already been consumed.
								name = False
								header = False
								# Column labels (column E onward) taken from the first header row of the
								# sheet being processed.
								headline = []
								# Sheet blacklist. None of the current workbooks contain the 退役军人 entry;
								# that entry is a sheet shipped with the Excel template. Exceptions need
								# special handling.
								sheet_name_black_list = ['指标体系', '资源目录']
								# Column labels that are never written to the Word output.
								column_black_list = ['期望更新周期']

								# Collect every workbook to convert.
								xlsx_files = glob.glob('./附件2*.xlsx')

								# Alternative settings (disabled):
								# sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报']
								# xlsx_files = ['附件2：自治区文化旅游主题共享资源库需求对接表_3932232206984868.xlsx']


								def strip_ordinal_prefix(title):
								    """Return *title* without its leading "<ordinal>、" prefix.

								    Everything up to and including the first '、' is dropped. A title that
								    contains no '、' (or is not a string) is returned as text unchanged
								    instead of raising.
								    """
								    text = str(title)
								    _, separator, remainder = text.partition('、')
								    return remainder if separator else text


								def format_cell(cell):
								    """Return the text written to the document for one cell value.

								    Missing values become '无'; anything else is converted to text with
								    line breaks removed. Only genuinely missing cells are replaced, so text
								    that merely contains the letters "nan" is left intact.
								    """
								    if pd.isnull(cell):
								        return '无'
								    return str(cell).replace('\n', '')


								def output_stem(path):
								    """Return the output file name (without extension) for workbook *path*.

								    Uses the text between the first full-width colon and the first
								    underscore. When the path has no full-width colon the whole path is
								    used instead of raising, and a leftover '.xlsx' suffix is removed.
								    """
								    parts = path.split('：')
								    stem = (parts[1] if len(parts) > 1 else parts[0]).split('_')[0]
								    if stem.endswith('.xlsx'):
								        stem = stem[:-len('.xlsx')]
								    return stem


								for file in xlsx_files:
								    # Every output document starts from the Word template.
								    doc = Document('temp.docx')
								    print(f"正在处理文件: {file}")

								    # The context manager closes the workbook even when a sheet fails.
								    with pd.ExcelFile(file) as xls:
								        for sheet_name in xls.sheet_names:
								            name = False
								            header = False
								            if sheet_name in sheet_name_black_list:
								                continue
								            print(f"正在处理工作表: {sheet_name}")
								            df = xls.parse(sheet_name)
								            doc.add_heading(sheet_name, 1 + level_offset)
								            # The header labels are captured only once per sheet.
								            initialized_header = False

								            # Columns B, D and E (positions 1, 3, 4) are read below; a
								            # narrower sheet cannot follow the expected layout.
								            if df.shape[1] < 5:
								                print(f"跳过工作表（列数不足）: {sheet_name}")
								                continue

								            for _, row in df.iterrows():
								                # Positional access via .iloc: plain row[4] is label-based
								                # on newer pandas and breaks on string column labels.
								                col_b = row.iloc[1]
								                col_d = row.iloc[3]
								                col_e = row.iloc[4]

								                # An empty column E marks the start of a new table: reset
								                # the title/header flags.
								                if pd.isnull(col_e):
								                    name = False
								                    header = False
								                # An empty column B means the whole row is empty.
								                if pd.isnull(col_b):
								                    continue
								                # First row of a table: column B holds the table title.
								                if not name:
								                    doc.add_heading(strip_ordinal_prefix(col_b), 2 + level_offset)
								                    name = True
								                    continue
								                # Second row of a table: the header row.
								                if not header:
								                    if not initialized_header:
								                        initialized_header = True
								                        headline[:] = list(row.iloc[4:])
								                    header = True
								                    continue
								                # From the third row on, a non-empty column D is a
								                # sub-table title.
								                if not pd.isnull(col_d):
								                    doc.add_heading(str(col_d).replace(' ', ''), 3 + level_offset)
								                # Column E cannot be empty here (an empty E restarts a
								                # table above); str() guards against numeric cells.
								                doc.add_heading(str(col_e), 4 + level_offset)
								                for head, cell in zip(headline, row.iloc[4:]):
								                    # Columns without a header label have nothing to print
								                    # as the field name, so they are skipped.
								                    if pd.isnull(head) or head in column_black_list:
								                        continue
								                    doc.add_paragraph(str(head) + "：" + format_cell(cell))

								    # Save the document under the name derived from the workbook path.
								    out_file_name = output_stem(file)
								    doc.save(out_file_name + ".docx")

								print("处理完成")