Pdf To Structured技能使用说明
2026-03-29
新闻来源:网淘吧
围观:17
电脑广告
手机广告
PDF转结构化数据
概述
基于DDC(Data-Driven Construction,数据驱动型建造)方法(第2.4章),此技能可将非结构化的PDF文档转换为适合分析和集成的结构化格式。建筑项目会产生大量的PDF文档——如技术规格、物料清单、进度表和报告——这些文档中的数据都需要被提取和处理。
参考书籍:《数据向结构化形式的转换》/ "Data Transformation to Structured Form"
"将数据从非结构化形式转换为结构化形式,既是艺术,也是科学。这个过程通常占据了数据工程师工作的很大一部分。" —— DDC书籍,第2.4章
ETL流程概述
转换遵循ETL模式:
- 提取:加载PDF文档
- 转换:解析并结构化内容
- 加载:保存为CSV、Excel或JSON格式
快速开始
import pdfplumber
import pandas as pd
# Extract the first detected table on the first page and save it to Excel.
with pdfplumber.open("construction_spec.pdf") as pdf:
    page = pdf.pages[0]
    table = page.extract_table()

# Only build the DataFrame when a table was actually found; indexing an
# empty/None result would otherwise raise before anything is written.
if table:
    # First row is used as the header, remaining rows as data.
    df = pd.DataFrame(table[1:], columns=table[0])
    df.to_excel("extracted_data.xlsx", index=False)
else:
    print("No table found on the first page of construction_spec.pdf")
安装
# Core libraries
pip install pdfplumber pandas openpyxl
# For scanned PDFs (OCR)
pip install pytesseract pdf2image
# Also install Tesseract OCR: https://github.com/tesseract-ocr/tesseract
# pdf2image also needs the Poppler utilities installed and available on PATH
# For the OCR table-extraction example below (it imports cv2 and numpy)
pip install opencv-python numpy
# For advanced PDF operations
pip install pypdf
原生PDF提取 (pdfplumber)
从PDF中提取所有表格
import pdfplumber
import pandas as pd
def _unique_headers(header_row):
    """Return usable, unique column labels for one extracted header row.

    Blank or missing cells become ``column_<position>`` and repeated labels
    get a numeric suffix, so frames built from different tables never carry
    duplicate column names.
    """
    labels = []
    seen = set()
    for position, cell in enumerate(header_row, start=1):
        text = "" if cell is None else str(cell).strip()
        base = text or f"column_{position}"
        label = base
        suffix = 2
        while label in seen:
            label = f"{base}_{suffix}"
            suffix += 1
        seen.add(label)
        labels.append(label)
    return labels


def extract_tables_from_pdf(pdf_path):
    """Extract all tables from a PDF file into a single DataFrame.

    Every table with a header row and at least one data row is converted to
    a DataFrame (first row as header) and tagged with two extra columns:
    ``_page`` and ``_table`` (both 1-based). All frames are then concatenated.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no usable
        table was found.
    """
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table_num, table in enumerate(page.extract_tables(), start=1):
                # Skip empty results and header-only tables.
                if not table or len(table) < 2:
                    continue
                # Header cells are made unique so that pd.concat does not
                # fail when tables with different columns are combined.
                df = pd.DataFrame(table[1:], columns=_unique_headers(table[0]))
                df['_page'] = page_num
                df['_table'] = table_num
                frames.append(df)
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()
# Usage: collect every table found in the PDF into one DataFrame and save it as an Excel workbook
df = extract_tables_from_pdf("material_specification.pdf")
df.to_excel("materials.xlsx", index=False)
提取带布局的文本
import pdfplumber
def extract_text_with_layout(pdf_path, layout=False):
    """Extract the text of every page, joined with a visible page-break marker.

    Pages that yield no text are skipped. Page order is preserved.

    Args:
        pdf_path: Path to the PDF file.
        layout: When True, ``layout=True`` is passed to pdfplumber's
            ``extract_text`` so it tries to mimic the visual layout of the
            page (requires a pdfplumber version that supports the option).
            The default keeps the plain ``extract_text()`` call.

    Returns:
        A single string with the page texts separated by
        ``"\\n\\n--- Page Break ---\\n\\n"``; an empty string when no page
        produced text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Only pass the keyword when requested so the default call stays
            # identical to the plain extract_text() call.
            text = page.extract_text(layout=True) if layout else page.extract_text()
            if text:
                page_texts.append(text)
    return "\n\n--- Page Break ---\n\n".join(page_texts)
# Usage: extract the report text and save it as a UTF-8 text file
text = extract_text_with_layout("project_report.pdf")
with open("report_text.txt", "w", encoding="utf-8") as f:
    f.write(text)
按位置提取特定表格
import pdfplumber
import pandas as pd
def extract_table_from_area(pdf_path, page_num, bbox):
    """Extract a table from a specific rectangular area of one page.

    The page is restricted to ``bbox`` first, then a single table is
    extracted from that region. The first extracted row is used as header.

    Args:
        pdf_path: Path to PDF file.
        page_num: Page number (0-indexed).
        bbox: Bounding box (x0, top, x1, bottom) in points.

    Returns:
        A DataFrame with the table contents, or an empty DataFrame when no
        table was detected inside the area.

    Raises:
        IndexError: If ``page_num`` is outside the document's page range.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        # Limit extraction to the requested region of the page.
        cropped = page.within_bbox(bbox)
        table = cropped.extract_table()

    if table:
        return pd.DataFrame(table[1:], columns=table[0])
    return pd.DataFrame()
# Usage - extract a table from a specific area of the first page (page_num=0)
# bbox format: (left, top, right, bottom) in points (1 inch = 72 points)
df = extract_table_from_area("drawing.pdf", 0, (50, 100, 550, 400))
扫描PDF处理(OCR)
从扫描PDF中提取文本
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
def ocr_scanned_pdf(pdf_path, language='eng'):
    """Extract text from a scanned PDF using OCR, one row per page.

    Args:
        pdf_path: Path to scanned PDF.
        language: Tesseract language code (eng, deu, rus, etc.).

    Returns:
        A DataFrame with the columns ``page`` (1-based page number) and
        ``text`` (OCR output for that page).
    """
    # Convert PDF pages to images (rendered at 300 DPI).
    images = convert_from_path(pdf_path, dpi=300)

    records = [
        {
            'page': page_number,
            'text': pytesseract.image_to_string(image, lang=language),
        }
        for page_number, image in enumerate(images, start=1)
    ]
    # Explicit columns keep the result schema stable even when no page
    # image was produced.
    return pd.DataFrame(records, columns=['page', 'text'])
# Usage: OCR every page of a scanned PDF and save one row per page (page number + text) to CSV
df = ocr_scanned_pdf("scanned_specification.pdf", language='eng')
df.to_csv("ocr_results.csv", index=False)
OCR表格提取
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import cv2
import numpy as np
def ocr_table_from_scanned_pdf(pdf_path, page_num=0):
    """Extract a whitespace-separated table from one page of a scanned PDF.

    The page is rendered to an image, converted to grayscale, run through
    Tesseract, and every non-blank OCR line is split on whitespace into
    cells. The first line is treated as the header row.

    Args:
        pdf_path: Path to the scanned PDF.
        page_num: Page number (0-indexed).

    Returns:
        A DataFrame with one row per OCR line after the header, or an empty
        DataFrame when the page could not be rendered or produced no text.
        Rows shorter than the widest row are padded with None; extra columns
        beyond the header are named ``column_<position>``.
    """
    # Convert the requested page to an image (pdf2image pages are 1-based).
    images = convert_from_path(pdf_path, first_page=page_num + 1,
                               last_page=page_num + 1, dpi=300)
    if not images:
        return pd.DataFrame()

    image = np.array(images[0])

    # OCR is run on the grayscale image.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # --oem / --psm select Tesseract's engine and page-segmentation modes;
    # NOTE(review): psm 6 presumably treats the page as one uniform block of
    # text - confirm against the Tesseract documentation.
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(gray, config=custom_config)

    # Parse text into table structure: one row per non-blank line.
    lines = text.strip().split('\n')
    rows = [line.split() for line in lines if line.strip()]
    if not rows:
        return pd.DataFrame()

    # OCR lines rarely split into the same number of cells; pad rows and
    # header to a common width so the DataFrame constructor does not raise.
    width = max(len(row) for row in rows)
    header = rows[0] + [
        f"column_{position}" for position in range(len(rows[0]) + 1, width + 1)
    ]
    body = [row + [None] * (width - len(row)) for row in rows[1:]]
    return pd.DataFrame(body, columns=header)
# Usage: OCR the first page (default page_num=0) of a scanned BOM and print the resulting table
df = ocr_table_from_scanned_pdf("scanned_bom.pdf")
print(df)
特定于建筑行业的提取
物料清单(BOM)提取
import pdfplumber
import pandas as pd
import re
# Words whose presence in a row marks it as the BOM header row.
_BOM_HEADER_KEYWORDS = ('item', 'description', 'quantity', 'unit', 'material')


def _is_bom_header(row):
    """Return True when any cell of *row* contains a BOM header keyword."""
    return any(
        keyword in str(cell).lower()
        for cell in row
        for keyword in _BOM_HEADER_KEYWORDS
    )


def _bom_column_labels(header_row):
    """Return stripped, unique labels for a BOM header row.

    Blank or missing cells become ``column_<position>``; repeated labels get
    a numeric suffix so no value is overwritten when rows are zipped to dicts.
    """
    labels = []
    seen = set()
    for position, cell in enumerate(header_row, start=1):
        text = "" if cell is None else str(cell).strip()
        base = text or f"column_{position}"
        label = base
        suffix = 2
        while label in seen:
            label = f"{base}_{suffix}"
            suffix += 1
        seen.add(label)
        labels.append(label)
    return labels


def extract_bom_from_pdf(pdf_path):
    """Extract Bill of Materials rows from a construction PDF.

    For every table on every page, the first row containing one of the
    common BOM header words (item, description, quantity, unit, material) is
    taken as the header; all following non-empty rows of that table become
    items keyed by the header labels. Tables without such a row are ignored.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A DataFrame with one row per BOM item (empty when nothing matched).
    """
    all_items = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table or len(table) < 2:
                    continue
                # Find header row (look for common BOM headers).
                for i, row in enumerate(table):
                    if not row or not _is_bom_header(row):
                        continue
                    headers = _bom_column_labels(row)
                    # Everything after the header is data; skip blank rows.
                    for data_row in table[i + 1:]:
                        if data_row and any(data_row):
                            all_items.append(dict(zip(headers, data_row)))
                    # Only the first matching header per table is used.
                    break
    return pd.DataFrame(all_items)
# Usage: pull the BOM rows out of the PDF and save them to an Excel workbook
bom = extract_bom_from_pdf("project_bom.pdf")
bom.to_excel("bom_extracted.xlsx", index=False)
项目进度表提取
import pdfplumber
import pandas as pd
from datetime import datetime
def extract_schedule_from_pdf(pdf_path):
    """Extract project schedule/gantt rows from a PDF.

    A table counts as schedule-like when its first row mentions one of the
    words task, activity, start, end or duration (case-insensitive). Each
    non-empty row below that header becomes one task keyed by the header
    cells. Columns named Start, End, Start Date, End Date or Finish (matched
    case-insensitively, ignoring surrounding whitespace) are parsed as
    datetimes; unparseable values become NaT.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A DataFrame with one row per task (empty when nothing matched).
    """
    schedule_keywords = ('task', 'activity', 'start', 'end', 'duration')
    date_column_names = {'start', 'end', 'start date', 'end date', 'finish'}

    all_tasks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue
                headers = table[0] or []
                # Check if it looks like a schedule.
                header_text = str(headers).lower()
                if not any(kw in header_text for kw in schedule_keywords):
                    continue
                for row in table[1:]:
                    if row and any(row):
                        all_tasks.append(dict(zip(headers, row)))

    df = pd.DataFrame(all_tasks)

    # Try to parse dates. Matching ignores case so that headers such as
    # "start" or "END DATE" are handled like the table detection above.
    for col in df.columns:
        if isinstance(col, str) and col.strip().lower() in date_column_names:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    return df
# Usage: extract schedule rows from the PDF and print the resulting DataFrame
schedule = extract_schedule_from_pdf("project_schedule.pdf")
print(schedule)
规范解析
import pdfplumber
import pandas as pd
import re
# A numbered heading: a dotted section number at the start of a line,
# followed by whitespace and a title that begins with an uppercase letter.
# Anchoring to the line start avoids matching measurements inside sentences
# (e.g. "... shall be 2.5 MPa"), and the repeated group captures headings of
# any depth with their full number.
_SPEC_SECTION_RE = re.compile(
    r'^[ \t]*(\d+(?:\.\d+)+)\s+([A-Z][^\n]+)', re.MULTILINE
)


def parse_specification_pdf(pdf_path):
    """Parse the numbered section headings of a specification document.

    The text of all pages is concatenated and scanned for lines that start
    with a dotted section number (``1.2``, ``3.4.1``, ...) followed by a
    title starting with an uppercase letter.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A DataFrame with the columns ``section_number``, ``title`` and
        ``level`` (number of dot-separated parts); empty when no heading was
        found.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text + "\n")
    full_text = "".join(page_texts)

    specs = [
        {
            'section_number': num,
            'title': title.strip(),
            'level': len(num.split('.')),
        }
        for num, title in _SPEC_SECTION_RE.findall(full_text)
    ]
    return pd.DataFrame(specs)
# Usage: list the numbered section headings found in the specification
specs = parse_specification_pdf("technical_spec.pdf")
print(specs)
批处理
处理多个PDF文件
import pdfplumber
import pandas as pd
from pathlib import Path
def _batch_column_labels(header_row):
    """Return stripped, unique column labels for one extracted header row.

    Blank or missing cells become ``column_<position>``; repeated labels get
    a numeric suffix so the per-table frames can be concatenated safely.
    """
    labels = []
    seen = set()
    for position, cell in enumerate(header_row, start=1):
        text = "" if cell is None else str(cell).strip()
        base = text or f"column_{position}"
        label = base
        suffix = 2
        while label in seen:
            label = f"{base}_{suffix}"
            suffix += 1
        seen.add(label)
        labels.append(label)
    return labels


def batch_extract_tables(folder_path, output_folder):
    """Process all PDFs in a folder and extract their tables to Excel.

    Every table with a header row and at least one data row is written to
    ``<pdf stem>_p<page>_t<table>.xlsx`` in ``output_folder`` (created when
    missing). All tables are additionally combined into ``all_tables.xlsx``.
    Each frame carries ``_source_file``, ``_page`` and ``_table`` columns.
    A failure while processing one PDF is reported and the remaining files
    are still processed.

    Args:
        folder_path: Folder containing the ``*.pdf`` files.
        output_folder: Folder that receives the Excel files.

    Returns:
        The number of tables extracted.
    """
    output_dir = Path(output_folder)
    # Create the target folder up front; otherwise every to_excel call fails.
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    # Sorted so files are processed (and combined) in a reproducible order.
    for pdf_path in sorted(Path(folder_path).glob("*.pdf")):
        print(f"Processing: {pdf_path.name}")
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    for table_num, table in enumerate(page.extract_tables(), start=1):
                        if not table or len(table) < 2:
                            continue
                        df = pd.DataFrame(
                            table[1:], columns=_batch_column_labels(table[0])
                        )
                        df['_source_file'] = pdf_path.name
                        df['_page'] = page_num
                        df['_table'] = table_num
                        # Save individual table
                        output_name = f"{pdf_path.stem}_p{page_num}_t{table_num}.xlsx"
                        df.to_excel(output_dir / output_name, index=False)
                        results.append(df)
        except Exception as e:
            # Best-effort batch: report the failure and continue with the
            # next file instead of aborting the whole run.
            print(f"Error processing {pdf_path.name}: {e}")

    # Combined output
    if results:
        combined = pd.concat(results, ignore_index=True)
        combined.to_excel(output_dir / "all_tables.xlsx", index=False)
    return len(results)
# Usage: process every *.pdf in ./pdf_documents/ and write the extracted tables to ./extracted/
count = batch_extract_tables("./pdf_documents/", "./extracted/")
print(f"Extracted {count} tables")
提取后的数据清洗
import pandas as pd
def clean_extracted_data(df, numeric_threshold=0.5,
                         metadata_columns=('_page', '_table', '_source_file')):
    """Clean common issues in PDF-extracted data.

    Steps, in order:

    1. Strip surrounding whitespace from string cells (non-string cells in
       text columns are left untouched).
    2. Drop rows whose data cells are all missing or empty strings. Columns
       listed in ``metadata_columns`` are ignored for this check, because
       they are filled for every row and would keep blank rows alive.
    3. Convert a text column to numeric when more than ``numeric_threshold``
       of the remaining rows parse as numbers; values that do not parse
       become NaN in a converted column. Non-text columns (numbers,
       datetimes, ...) are never converted.

    The input frame is not modified.

    Args:
        df: DataFrame to clean.
        numeric_threshold: Fraction of rows that must be numeric before a
            text column is converted (default 0.5).
        metadata_columns: Column labels excluded from the empty-row check.

    Returns:
        A cleaned copy of ``df`` (original index labels are kept).
    """
    cleaned = df.copy()

    text_columns = [
        col for col in cleaned.columns
        if pd.api.types.is_object_dtype(cleaned[col].dtype)
        or isinstance(cleaned[col].dtype, pd.StringDtype)
    ]

    # Strip per cell: the .str accessor would turn non-string cells into NaN.
    for col in text_columns:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # A cell is blank when it is missing or an empty string.
    data_columns = [col for col in cleaned.columns if col not in metadata_columns]
    if data_columns:
        blank = cleaned[data_columns].isna()
        for col in text_columns:
            if col in data_columns:
                is_empty_text = cleaned[col].map(
                    lambda value: isinstance(value, str) and value == ''
                ).astype(bool)
                blank[col] = blank[col] | is_empty_text
        cleaned = cleaned[~blank.all(axis=1)].copy()

    # Convert mostly-numeric text columns. Restricting this to text columns
    # keeps datetime columns from being turned into integers.
    for col in text_columns:
        numeric_series = pd.to_numeric(cleaned[col], errors='coerce')
        if numeric_series.notna().sum() > len(cleaned) * numeric_threshold:
            cleaned[col] = numeric_series
    return cleaned
# Usage: clean the tables returned by extract_tables_from_pdf (defined in an earlier snippet)
# and save the cleaned result as an Excel workbook
df = extract_tables_from_pdf("document.pdf")
df_clean = clean_extracted_data(df)
df_clean.to_excel("clean_data.xlsx", index=False)
导出选项
import pandas as pd
import json
def export_to_multiple_formats(df, base_name):
    """Export a DataFrame to Excel, CSV, JSON and JSON Lines files.

    The files are written as ``<base_name>.xlsx``, ``<base_name>.csv``,
    ``<base_name>.json`` and ``<base_name>.jsonl``. The directory part of
    ``base_name`` is created when it does not exist yet.

    Args:
        df: DataFrame to export.
        base_name: Output path without extension.
    """
    # Local import: this snippet's own import list does not include pathlib.
    from pathlib import Path

    Path(base_name).parent.mkdir(parents=True, exist_ok=True)

    # Excel
    df.to_excel(f"{base_name}.xlsx", index=False)

    # CSV (written with a UTF-8 byte-order mark via the utf-8-sig codec)
    df.to_csv(f"{base_name}.csv", index=False, encoding='utf-8-sig')

    # JSON - force_ascii=False keeps non-ASCII text readable instead of
    # escaping every such character as \uXXXX.
    df.to_json(f"{base_name}.json", orient='records', indent=2,
               force_ascii=False)

    # JSON Lines (for large datasets): one record per line
    df.to_json(f"{base_name}.jsonl", orient='records', lines=True,
               force_ascii=False)
# Usage: extract the tables of a PDF (extract_tables_from_pdf is defined in an earlier snippet)
# and export them as extracted_data.xlsx / .csv / .json / .jsonl
df = extract_tables_from_pdf("document.pdf")
export_to_multiple_formats(df, "extracted_data")
快速参考
| 任务 | 工具 | 代码 |
|---|---|---|
| 提取表格 | pdfplumber | page.extract_table() |
| 提取文本 | pdfplumber | page.extract_text() |
| OCR扫描 | pytesseract | pytesseract.image_to_string(image) |
| 合并PDF | pypdf | writer.add_page(page) |
| 转换为图像 | pdf2image | convert_from_path(pdf) |
故障排除
| 问题 | 解决方案 |
|---|---|
| 表格未检测到 | 尝试调整表格设置:page.extract_table(table_settings={}) |
| 列对齐错误 | 使用可视化调试:page.to_image().debug_tablefinder()(或 page.to_image().draw_rects(page.rects)) |
| OCR质量差 | 提高DPI,预处理图像,使用正确的语言 |
| 内存问题 | 一次处理一页,处理完成后关闭PDF |
资源
- 书籍《数据驱动型建造》作者:阿尔乔姆·博伊科,第2.4章
- 网站:https://datadrivenconstruction.io
- pdfplumber 文档:https://github.com/jsvine/pdfplumber
- Tesseract OCR:https://github.com/tesseract-ocr/tesseract
后续步骤
- 请参阅 image-to-data 以了解图像处理
- 请参阅 cad-to-data 以了解CAD/BIM数据提取
- 请参阅 etl-pipeline 以了解自动化处理工作流
- 请参阅 data-quality-check 以了解提取数据的验证
文章底部电脑广告
手机广告位-内容正文底部


微信扫一扫,打赏作者吧~