0%

Python 解析 PDF 转换为csv

与另外几个 PDF 解析库相比,pdfplumber 解析出来的结果更容易理解:它以 list 的形式返回数据,还可以把 PDF 中的表格按表格结构提取出来并输出到控制台(console)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
import pdfplumber
import datetime
import sys
import os


# Interactive script: ask for a PDF, expand its order tables into one SKU
# entry per ordered unit, and write the result to result.csv.


def expand_order_row(sku_cell, qty_cell):
    """Return one SKU entry per ordered unit for a single order row.

    Two cell layouts are handled:

    * single SKU: ``qty_cell`` is a plain number, and ``sku_cell`` is
      repeated that many times;
    * several SKUs: both cells hold newline-separated lines that pair up
      positionally, e.g. ``'SKU-A\\nSKU-B'`` with ``'2\\n1'``.

    Progress is printed to the console as the row is processed.

    Args:
        sku_cell: Text of the SKU cell (``None`` is treated as empty).
        qty_cell: Text of the quantity cell (``None`` is treated as empty).

    Returns:
        A list of SKU strings, each SKU repeated by its quantity.

    Raises:
        ValueError: If the SKU and quantity lines do not pair up, or if a
            quantity is not a non-negative whole number.
    """
    # pdfplumber reports empty cells as None; normalise so str methods work.
    sku_cell = sku_cell or ''
    qty_cell = qty_cell or ''

    # Check whether the quantity is purely numeric (single-SKU order).
    if qty_cell.isdigit():
        print('单sku和多数量的订单', sku_cell, qty_cell)
        return [sku_cell] * int(qty_cell)

    # Quantity is not purely numeric: multi-line cells are joined with "\n".
    print('多个sku多数量的订单', sku_cell, qty_cell)
    sku_list = sku_cell.split('\n')
    qty_list = qty_cell.split('\n')
    # e.g. ['WF-DC-TAUPE-T', 'WF-DC-GREY-T'] ['2', '1']
    print(sku_list, qty_list)

    if len(sku_list) != len(qty_list):
        raise ValueError(
            'SKU/quantity line count mismatch: %r vs %r' % (sku_list, qty_list)
        )

    expanded = []
    for sku, qty in zip(sku_list, qty_list):
        qty = qty.strip()
        if not qty.isdigit():
            raise ValueError('quantity is not a number: %r' % qty)
        # Repeat by the numeric value; iterating over the quantity string
        # itself would count its characters ("12" -> 2 entries).
        expanded.extend([sku] * int(qty))
    return expanded


def main():
    """Prompt for a PDF file name and export its ordered SKUs to result.csv.

    Every second table found in the PDF is read; row 1 of such a table is
    expected to hold the SKU in column 0 and the quantity in column 1.
    Any failure is reported on the console instead of propagating.
    """
    try:
        pdf_name = input("输入PDF文件名:")
        data = []
        with pdfplumber.open(pdf_name) as pdf:
            print(len(pdf.pages))
            # NOTE(review): the original indentation was lost; the counter is
            # assumed to advance once per table (not once per page) - confirm.
            table_no = 1
            for page in pdf.pages:
                for table in page.extract_tables():
                    # Only even-numbered tables are processed; presumably the
                    # odd-numbered ones are not order rows - verify against
                    # the actual PDFs.
                    if table_no % 2 == 0:
                        order_row = table[1]
                        print(table_no, order_row[1])
                        data.extend(
                            expand_order_row(order_row[0], order_row[1])
                        )
                    table_no += 1
        print(data)
        # newline='\r\n' makes every '\n' written below come out as CRLF.
        with open("result.csv", "w", newline='\r\n') as file:
            file.write('SKU' + '\n')
            for v in data:
                print(v)
                file.write(v + '\n')
    except Exception as error:
        # str() is required: concatenating the exception object itself
        # raises TypeError and hides the real error.
        print('pdf文件错误:' + str(error))
    # Keep the console window open; "pause" only exists on Windows.
    if os.name == 'nt':
        os.system('pause')


if __name__ == "__main__":
    main()