Browse Source

init

master
liuqingwen 9 months ago
commit
7ea977268a
  1. 90
      excel2word.py
  2. 80
      excel2word2.py
  3. 53
      execl.py
  4. BIN
      temp.docx
  5. 31
      wordformat.py

90
excel2word.py

@ -0,0 +1,90 @@
import glob
import pandas as pd
from docx import Document
# Converts requirement workbooks (.xlsx) into Word documents built on the
# 'temp.docx' template: one heading per sheet, table, sub-table and item,
# followed by one paragraph per column of the item.

# Heading-level shift added to every heading level (minimum 0). It is reset to
# 1 for each sheet and each new table, and lowered by one whenever a table or
# sub-table title merely repeats its parent heading.
level_offset = 1
# Per-table state; both flags are reset for every sheet and every new table.
name = False    # True once the current table's title row has been consumed
header = False  # True once the current table's header row has been consumed
# Column labels captured from the first header row seen in the current sheet.
headline = []
# Sheets to skip. '退役军人' is a sheet shipped with the Excel template; none of
# the current workbooks fill it in, so a workbook that does needs special care.
sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报', '退役军人']
# Columns that are never written to the Word document.
column_black_list = ['期望更新周期']
# Text inserted between a column label and its value. Empty by default, so the
# value directly follows the label.
label_separator = ""
# Workbooks to convert. glob.glob('./附件2*.xlsx') can be used instead to
# process every matching workbook in the current directory.
xlsx_files = ['附件2:自治区文化旅游主题共享资源库需求对接表_3932232206984868.xlsx']


def strip_numbering(title):
    """Return *title* without a leading "<Chinese numeral>、" prefix.

    Titles without the '、' (U+3001) delimiter are returned unchanged.
    """
    return str(title).split('、', 1)[-1]


def output_stem(path):
    """Derive the output file name (without extension) from a workbook path.

    Keeps the part of the base name after the first colon (ASCII or
    full-width) and before the first underscore, e.g.
    './附件2:需求对接表_123.xlsx' -> '需求对接表'. A name without a colon or
    underscore is used as-is, minus its extension.
    """
    base = str(path).replace('\\', '/').rsplit('/', 1)[-1]
    stem = base.rsplit('.', 1)[0]
    # Treat the full-width colon (U+FF1A) like the ASCII one.
    stem = stem.replace('\uff1a', ':')
    return stem.split(':', 1)[-1].split('_')[0]


def cell_text(cell):
    """Return a cell as single-line text; empty (NaN/None) cells become ''."""
    if pd.isnull(cell):
        return ''
    return str(cell).replace('\n', '')


for file in xlsx_files:
    # Open the Word template the output document is built on.
    doc = Document('temp.docx')
    print(f"正在处理文件: {file}")
    # The context manager closes the workbook even if a sheet fails to convert.
    with pd.ExcelFile(file) as xls:
        for sheet_name in xls.sheet_names:
            level_offset = 1
            name = False
            header = False
            if sheet_name in sheet_name_black_list:
                continue
            print(f"正在处理工作表: {sheet_name}")
            df = xls.parse(sheet_name)
            doc.add_heading(sheet_name, 1 + level_offset)
            initialized_header = False
            for _, row in df.iterrows():
                # An empty column E marks the start of a new table: reset the
                # heading offset and the title/header flags.
                if pd.isnull(row.iloc[4]):
                    level_offset = 1
                    name = False
                    header = False
                # An empty column B means the whole row is blank: skip it.
                if pd.isnull(row.iloc[1]):
                    continue
                # First row of a table: column B holds the table title.
                if not name:
                    table_name = strip_numbering(row.iloc[1])
                    name = True
                    if table_name == sheet_name:
                        # The title repeats the sheet heading: do not emit it
                        # again and lift the following headings by one level.
                        level_offset -= 1
                    else:
                        doc.add_heading(table_name, 2 + level_offset)
                    continue
                # Second row of a table: the column header row. Only the first
                # header row of a sheet is captured.
                if not header:
                    if not initialized_header:
                        initialized_header = True
                        headline.clear()
                        headline.extend(row.iloc[4:])
                    header = True
                    continue
                # From the third row on, a non-empty column D is a sub-table name.
                sub_name = row.iloc[3]
                if not pd.isnull(sub_name):
                    sub_name = str(sub_name).replace(' ', '')
                    if sub_name in (sheet_name, table_name):
                        # Repeats a parent heading: skip it and lift one level.
                        level_offset -= 1
                    else:
                        doc.add_heading(sub_name, 3 + level_offset)
                # Column E is non-empty here (an empty one was handled above as
                # a table boundary); it becomes the heading of this item.
                doc.add_heading(str(row.iloc[4]), 4 + level_offset)
                for head, cell in zip(headline, row.iloc[4:]):
                    # Skip unlabeled columns and black-listed columns.
                    if pd.isnull(head) or head in column_black_list:
                        continue
                    doc.add_paragraph(str(head) + label_separator + cell_text(cell))
    # Save the result under the name derived from the workbook file name.
    doc.save(output_stem(file) + ".docx")
print("处理完成")

80
excel2word2.py

@ -0,0 +1,80 @@
import glob
import pandas as pd
from docx import Document
# Converts every './附件2*.xlsx' workbook into a Word document built on the
# 'temp.docx' template: one heading per sheet, table, sub-table and item,
# followed by one paragraph per column of the item.

# Heading-level shift added to every heading level (minimum 0).
level_offset = 1
# Per-table state; both flags are reset for every sheet and every new table.
name = False    # True once the current table's title row has been consumed
header = False  # True once the current table's header row has been consumed
# Column labels captured from the first header row seen in the current sheet.
headline = []
# Sheets that are skipped entirely.
sheet_name_black_list = ['指标体系', '资源目录']
# Columns that are never written to the Word document.
column_black_list = ['期望更新周期']
# Text inserted between a column label and its value. Empty by default, so the
# value directly follows the label.
label_separator = ""
# Every matching workbook in the current directory is converted.
xlsx_files = glob.glob('./附件2*.xlsx')


def strip_numbering(title):
    """Return *title* without a leading "<Chinese numeral>、" prefix.

    Titles without the '、' (U+3001) delimiter are returned unchanged.
    """
    return str(title).split('、', 1)[-1]


def output_stem(path):
    """Derive the output file name (without extension) from a workbook path.

    Keeps the part of the base name after the first colon (ASCII or
    full-width) and before the first underscore, e.g.
    './附件2:需求对接表_123.xlsx' -> '需求对接表'. A name without a colon or
    underscore is used as-is, minus its extension.
    """
    base = str(path).replace('\\', '/').rsplit('/', 1)[-1]
    stem = base.rsplit('.', 1)[0]
    # Treat the full-width colon (U+FF1A) like the ASCII one.
    stem = stem.replace('\uff1a', ':')
    return stem.split(':', 1)[-1].split('_')[0]


def cell_text(cell):
    """Return a cell as single-line text; empty (NaN/None) cells become ''."""
    if pd.isnull(cell):
        return ''
    return str(cell).replace('\n', '')


for file in xlsx_files:
    # Open the Word template the output document is built on.
    doc = Document('temp.docx')
    print(f"正在处理文件: {file}")
    # The context manager closes the workbook even if a sheet fails to convert.
    with pd.ExcelFile(file) as xls:
        for sheet_name in xls.sheet_names:
            name = False
            header = False
            if sheet_name in sheet_name_black_list:
                continue
            print(f"正在处理工作表: {sheet_name}")
            df = xls.parse(sheet_name)
            doc.add_heading(sheet_name, 1 + level_offset)
            initialized_header = False
            for _, row in df.iterrows():
                # An empty column E marks the start of a new table: reset the
                # title/header flags.
                if pd.isnull(row.iloc[4]):
                    name = False
                    header = False
                # An empty column B means the whole row is blank: skip it.
                if pd.isnull(row.iloc[1]):
                    continue
                # First row of a table: column B holds the table title, whose
                # leading "<Chinese numeral>、" numbering is dropped.
                if not name:
                    doc.add_heading(strip_numbering(row.iloc[1]), 2 + level_offset)
                    name = True
                    continue
                # Second row of a table: the column header row. Only the first
                # header row of a sheet is captured.
                if not header:
                    if not initialized_header:
                        initialized_header = True
                        headline.clear()
                        headline.extend(row.iloc[4:])
                    header = True
                    continue
                # From the third row on, a non-empty column D is a sub-table name.
                sub_name = row.iloc[3]
                if not pd.isnull(sub_name):
                    doc.add_heading(str(sub_name).replace(' ', ''), 3 + level_offset)
                # Column E is non-empty here (an empty one was handled above as
                # a table boundary); it becomes the heading of this item.
                doc.add_heading(str(row.iloc[4]), 4 + level_offset)
                for head, cell in zip(headline, row.iloc[4:]):
                    # Skip unlabeled columns and black-listed columns.
                    if pd.isnull(head) or head in column_black_list:
                        continue
                    doc.add_paragraph(str(head) + label_separator + cell_text(cell))
    # Save the result under the name derived from the workbook file name.
    doc.save(output_stem(file) + ".docx")
print("处理完成")

53
execl.py

@ -0,0 +1,53 @@
import glob
import pandas as pd
# Dumps every './附件2*.xlsx' workbook into a UTF-8 text file: the sheet name,
# each table title, then one "label:value" line per column of every data row.

# Per-table state; both flags are reset for every sheet and every blank row.
name = False    # True once the current table's title row has been consumed
header = False  # True once the current table's header row has been consumed
# Column labels (column B plus columns E onward) of the current sheet's first
# header row.
headline = []
# Sheets that are skipped entirely.
sheet_name_black_list = ['填表说明', '数据资源目录结构', '新增需求填报']
xlsx_files = glob.glob('./附件2*.xlsx')


def output_stem(path):
    """Derive the output file name (without extension) from a workbook path.

    Keeps the part of the base name after the first colon (ASCII or
    full-width) and before the first underscore, e.g.
    './附件2:需求对接表_123.xlsx' -> '需求对接表'. A name without a colon or
    underscore is used as-is, minus its extension.
    """
    base = str(path).replace('\\', '/').rsplit('/', 1)[-1]
    stem = base.rsplit('.', 1)[0]
    # Treat the full-width colon (U+FF1A) like the ASCII one.
    stem = stem.replace('\uff1a', ':')
    return stem.split(':', 1)[-1].split('_')[0]


for file in xlsx_files:
    print(f"正在处理文件: {file}")
    out_file_name = output_stem(file)
    # Both the workbook and the output file are closed even if a sheet fails.
    # The output handle gets its own name so it does not rebind the loop
    # variable `file`.
    with pd.ExcelFile(file) as xls, \
            open(out_file_name + '.txt', 'w', encoding='utf-8') as out:
        for sheet_name in xls.sheet_names:
            name = False
            header = False
            if sheet_name in sheet_name_black_list:
                continue
            print(f"正在处理工作表: {sheet_name}")
            df = xls.parse(sheet_name)
            out.write(sheet_name + "\n")
            initialized_header = False
            for _, row in df.iterrows():
                # An empty column B separates tables: emit a gap and expect a
                # new title row and header row next.
                if pd.isnull(row.iloc[1]):
                    out.write("\n\n")
                    name = False
                    header = False
                    continue
                # First row of a table: column B holds the table title.
                if not name:
                    out.write(f"{row.iloc[1]}\n")
                    name = True
                    continue
                # Column B followed by columns E onward form one record.
                record = [row.iloc[1], *row.iloc[4:]]
                # Second row of a table: the column header row. Only the first
                # header row of a sheet is captured; the list is cleared first
                # so labels from an earlier sheet or workbook are not reused.
                if not header:
                    if not initialized_header:
                        initialized_header = True
                        headline.clear()
                        headline.extend(record)
                    header = True
                    continue
                # Data row: one "label:value" line per column, then a blank line.
                for head, cell in zip(headline, record):
                    value = str(cell).replace('\n', '')
                    out.write(f"{head}:{value}\n")
                out.write("\n")
print("处理完成")

BIN
temp.docx

Binary file not shown.

31
wordformat.py

@ -0,0 +1,31 @@
from docx import Document
# Rebuilds the outline of a database design document: every '数据库:' paragraph
# becomes a numbered level-1 heading, every non-empty '表名:' paragraph a
# numbered level-2 heading, and all other paragraphs are copied as plain text.
# The result is written to 'modified_document.docx' on top of 'temp.docx'.

# Source Word document whose paragraphs are read.
document = Document('数据库概要设计说明书.docx')
# Template document that receives the re-numbered content.
doc = Document('temp.docx')
# Paragraph prefixes that mark a database section and a table section.
database_prefix = '数据库:'
table_prefix = '表名:'
database_index = 0
table_index = 1
# True while inside a table section whose name is empty; the paragraphs of
# such a section are dropped.
empty_table = False
# Walk through every paragraph of the source document.
for paragraph in document.paragraphs:
    text = paragraph.text
    if text.startswith(database_prefix):
        database_index += 1
        doc.add_heading(f"{database_index}.{text}", 1)
        # Table numbering restarts inside every database section.
        table_index = 1
        # A new database section ends any preceding empty-table section, so
        # its own paragraphs are not dropped by a stale flag.
        empty_table = False
    elif text.startswith(table_prefix):
        # A paragraph consisting of the bare prefix carries no table name.
        empty_table = text == table_prefix
        if not empty_table:
            doc.add_heading(f"{database_index}.{table_index}.{text}", 2)
            table_index += 1
    elif not empty_table:
        doc.add_paragraph(text)
# Save the rebuilt document.
doc.save('modified_document.docx')
Loading…
Cancel
Save