Data Extraction Assistant
Extract structured data from PDFs, Word docs, emails, and HTML using the unstructured library.
Your data is trapped in PDFs, Word docs, emails, and HTML pages. Manually copying it into spreadsheets is mind-numbing, error-prone, and completely unscalable.
Who it's for: operations teams processing documents into databases, analysts pulling data from unstructured sources, developers building document processing pipelines, compliance teams extracting information from filings, anyone manually copy-pasting from PDFs
Example
"Extract all vendor names, amounts, and dates from these 200 PDF invoices" → Structured CSV output with consistent formatting, regardless of whether the source was a PDF, Word doc, or email — ready to import into your database
New here? 3-minute setup guide → | Already set up? Copy the template below.
# Data Extractor
## Overview
This workflow enables extraction of structured data from any document format using **unstructured** - a unified library for processing PDFs, Word docs, emails, HTML, and more. Get consistent, structured output regardless of input format.
## How to Use
1. Provide the document to process
2. Optionally specify extraction options
3. I'll extract structured elements with metadata
**Example prompts:**
- "Extract all text and tables from this PDF"
- "Parse this email and get the body, attachments, and metadata"
- "Convert this HTML page to structured elements"
- "Extract data from these mixed-format documents"
## Domain Knowledge
### unstructured Fundamentals
```python
from unstructured.partition.auto import partition
# Automatically detect and process any document
elements = partition("document.pdf")
# Access extracted elements
for element in elements:
print(f"Type: {type(element).__name__}")
print(f"Text: {element.text}")
print(f"Metadata: {element.metadata}")
```
### Supported Formats
| Format | Function | Notes |
|--------|----------|-------|
| PDF | `partition_pdf` | Native + scanned |
| Word | `partition_docx` | Full structure |
| PowerPoint | `partition_pptx` | Slides & notes |
| Excel | `partition_xlsx` | Sheets & tables |
| Email | `partition_email` | Body & attachments |
| HTML | `partition_html` | Tags preserved |
| Markdown | `partition_md` | Structure preserved |
| Plain Text | `partition_text` | Basic parsing |
| Images | `partition_image` | OCR extraction |
### Element Types
```python
from unstructured.documents.elements import (
Title,
NarrativeText,
Text,
ListItem,
Table,
Image,
Header,
Footer,
PageBreak,
Address,
EmailAddress,
)
# Elements have consistent structure
element.text # Raw text content
element.metadata # Rich metadata
element.category # Element type
element.id # Unique identifier
```
### Auto Partition
```python
from unstructured.partition.auto import partition
# Process any file type
elements = partition(
filename="document.pdf",
strategy="auto", # or "fast", "hi_res", "ocr_only"
include_metadata=True,
include_page_breaks=True,
)
# Filter by type
titles = [e for e in elements if isinstance(e, Title)]
tables = [e for e in elements if isinstance(e, Table)]
```
### Format-Specific Partitioning
```python
# PDF with options
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
filename="document.pdf",
strategy="hi_res", # High quality extraction
infer_table_structure=True, # Detect tables
include_page_breaks=True,
languages=["en"], # OCR language
)
# Word documents
from unstructured.partition.docx import partition_docx
elements = partition_docx(
filename="document.docx",
include_metadata=True,
)
# HTML
from unstructured.partition.html import partition_html
elements = partition_html(
filename="page.html",
include_metadata=True,
)
```
### Working with Tables
```python
from unstructured.partition.auto import partition
elements = partition("report.pdf", infer_table_structure=True)
# Extract tables
for element in elements:
if element.category == "Table":
print("Table found:")
print(element.text)
# Access structured table data
if hasattr(element, 'metadata') and element.metadata.text_as_html:
print("HTML:", element.metadata.text_as_html)
```
### Metadata Access
```python
from unstructured.partition.auto import partition
elements = partition("document.pdf")
for element in elements:
meta = element.metadata
# Common metadata fields
print(f"Page: {meta.page_number}")
print(f"Filename: {meta.filename}")
print(f"Filetype: {meta.filetype}")
print(f"Coordinates: {meta.coordinates}")
print(f"Languages: {meta.languages}")
```
### Chunking for AI/RAG
```python
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
# Partition document
elements = partition("document.pdf")
# Chunk by title (semantic chunks)
chunks = chunk_by_title(
elements,
max_characters=1000,
combine_text_under_n_chars=200,
)
# Or basic chunking
chunks = chunk_elements(
elements,
max_characters=500,
overlap=50,
)
for chunk in chunks:
print(f"Chunk ({len(chunk.text)} chars):")
print(chunk.text[:100] + "...")
```
### Batch Processing
```python
from unstructured.partition.auto import partition
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
def process_document(file_path):
"""Process single document."""
try:
elements = partition(str(file_path))
return {
'file': str(file_path),
'status': 'success',
'elements': len(elements),
'text': '\n\n'.join([e.text for e in elements])
}
except Exception as e:
return {
'file': str(file_path),
'status': 'error',
'error': str(e)
}
def batch_process(input_dir, max_workers=4):
"""Process all documents in directory."""
input_path = Path(input_dir)
files = list(input_path.glob('*'))
with ThreadPoolExecutor(max_workers=max_workers) as executor:
results = list(executor.map(process_document, files))
return results
```
### Export Formats
```python
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json, elements_to_dicts
elements = partition("document.pdf")
# To JSON string
json_str = elements_to_json(elements)
# To list of dicts
dicts = elements_to_dicts(elements)
# To DataFrame
import pandas as pd
df = pd.DataFrame(dicts)
```
## Best Practices
1. **Choose Strategy Wisely**: "fast" for speed, "hi_res" for accuracy
2. **Enable Table Detection**: For documents with tables
3. **Specify Language**: For better OCR on non-English docs
4. **Chunk for RAG**: Use semantic chunking for AI applications
5. **Handle Errors**: Some formats may fail gracefully
## Common Patterns
### Document to JSON
```python
def document_to_json(file_path, output_path=None):
"""Convert document to structured JSON."""
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json
import json
elements = partition(file_path)
# Create structured output
output = {
'source': file_path,
'elements': []
}
for element in elements:
output['elements'].append({
'type': type(element).__name__,
'text': element.text,
'metadata': {
'page': element.metadata.page_number,
'coordinates': element.metadata.coordinates.to_dict() if element.metadata.coordinates else None
}
})
if output_path:
with open(output_path, 'w') as f:
json.dump(output, f, indent=2)
return output
```
### Email Parser
```python
from unstructured.partition.email import partition_email
def parse_email(email_path):
"""Extract structured data from email."""
elements = partition_email(email_path)
email_data = {
'subject': None,
'from': None,
'to': [],
'date': None,
'body': [],
'attachments': []
}
for element in elements:
meta = element.metadata
# Extract headers from metadata
if meta.subject:
email_data['subject'] = meta.subject
if meta.sent_from:
email_data['from'] = meta.sent_from
if meta.sent_to:
email_data['to'] = meta.sent_to
# Body content
email_data['body'].append({
'type': type(element).__name__,
'text': element.text
})
return email_data
```
## Examples
### Example 1: Research Paper Extraction
```python
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
def extract_paper(pdf_path):
"""Extract structured data from research paper."""
elements = partition_pdf(
filename=pdf_path,
strategy="hi_res",
infer_table_structure=True,
include_page_breaks=True
)
paper = {
'title': None,
'abstract': None,
'sections': [],
'tables': [],
'references': []
}
# Find title (usually first Title element)
for element in elements:
if element.category == "Title" and not paper['title']:
paper['title'] = element.text
break
# Extract tables
for element in elements:
if element.category == "Table":
paper['tables'].append({
'page': element.metadata.page_number,
'content': element.text,
'html': element.metadata.text_as_html if hasattr(element.metadata, 'text_as_html') else None
})
# Chunk into sections
chunks = chunk_by_title(elements, max_characters=2000)
current_section = None
for chunk in chunks:
if chunk.category == "Title":
paper['sections'].append({
'title': chunk.text,
'content': ''
})
elif paper['sections']:
paper['sections'][-1]['content'] += chunk.text + '\n'
return paper
paper = extract_paper('research_paper.pdf')
print(f"Title: {paper['title']}")
print(f"Tables: {len(paper['tables'])}")
print(f"Sections: {len(paper['sections'])}")
```
### Example 2: Invoice Data Extraction
```python
from unstructured.partition.auto import partition
import re
def extract_invoice_data(file_path):
"""Extract key data from invoice."""
elements = partition(file_path, strategy="hi_res")
# Combine all text
full_text = '\n'.join([e.text for e in elements])
invoice = {
'invoice_number': None,
'date': None,
'total': None,
'vendor': None,
'line_items': [],
'tables': []
}
# Extract patterns
inv_match = re.search(r'Invoice\s*#?\s*:?\s*(\w+[-\w]*)', full_text, re.I)
if inv_match:
invoice['invoice_number'] = inv_match.group(1)
date_match = re.search(r'Date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.I)
if date_match:
invoice['date'] = date_match.group(1)
total_match = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', full_text, re.I)
if total_match:
invoice['total'] = float(total_match.group(1).replace(',', ''))
# Extract tables
for element in elements:
if element.category == "Table":
invoice['tables'].append(element.text)
return invoice
invoice = extract_invoice_data('invoice.pdf')
print(f"Invoice #: {invoice['invoice_number']}")
print(f"Total: ${invoice['total']}")
```
### Example 3: Document Corpus Builder
```python
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from pathlib import Path
import json
def build_corpus(input_dir, output_path):
"""Build searchable corpus from document collection."""
input_path = Path(input_dir)
corpus = []
# Support multiple formats
patterns = ['*.pdf', '*.docx', '*.html', '*.txt', '*.md']
files = []
for pattern in patterns:
files.extend(input_path.glob(pattern))
for file in files:
print(f"Processing: {file.name}")
try:
elements = partition(str(file))
chunks = chunk_by_title(elements, max_characters=1000)
for i, chunk in enumerate(chunks):
corpus.append({
'id': f"{file.stem}_{i}",
'source': str(file),
'type': type(chunk).__name__,
'text': chunk.text,
'page': chunk.metadata.page_number if chunk.metadata.page_number else None
})
except Exception as e:
print(f" Error: {e}")
# Save corpus
with open(output_path, 'w') as f:
json.dump(corpus, f, indent=2)
print(f"Corpus built: {len(corpus)} chunks from {len(files)} files")
return corpus
corpus = build_corpus('./documents', 'corpus.json')
```
## Limitations
- Complex layouts may need manual review
- OCR quality depends on image quality
- Large files may need chunking
- Some proprietary formats not supported
- API rate limits for cloud processing
## Installation
```bash
# Basic installation
pip install unstructured
# With all dependencies
pip install "unstructured[all-docs]"
# For PDF processing
pip install "unstructured[pdf]"
# For specific formats
pip install "unstructured[docx,pptx,xlsx]"
```
## Resources
- [unstructured GitHub](https://github.com/Unstructured-IO/unstructured)
- [Documentation](https://unstructured-io.github.io/unstructured/)
- [Unstructured API](https://unstructured.io/api-key)What This Does
This workflow enables extraction of structured data from any document format using unstructured - a unified library for processing PDFs, Word docs, emails, HTML, and more. Get consistent, structured output regardless of input format.
Quick Start
Step 1: Create a Project Folder
mkdir -p ~/Documents/DataExtractor
Step 2: Download the Template
Click Download above, then:
mv ~/Downloads/CLAUDE.md ~/Documents/DataExtractor/
Step 3: Start Working
cd ~/Documents/DataExtractor
claude
How to Use
- Provide the document to process
- Optionally specify extraction options
- I'll extract structured elements with metadata
Example prompts:
- "Extract all text and tables from this PDF"
- "Parse this email and get the body, attachments, and metadata"
- "Convert this HTML page to structured elements"
- "Extract data from these mixed-format documents"
Best Practices
- Choose Strategy Wisely: "fast" for speed, "hi_res" for accuracy
- Enable Table Detection: For documents with tables
- Specify Language: For better OCR on non-English docs
- Chunk for RAG: Use semantic chunking for AI applications
- Handle Errors: Some formats may fail gracefully
Examples
Example 1: Research Paper Extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
def extract_paper(pdf_path):
"""Extract structured data from research paper."""
elements = partition_pdf(
filename=pdf_path,
strategy="hi_res",
infer_table_structure=True,
include_page_breaks=True
)
paper = {
'title': None,
'abstract': None,
'sections': [],
'tables': [],
'references': []
}
# Find title (usually first Title element)
for element in elements:
if element.category == "Title" and not paper['title']:
paper['title'] = element.text
break
# Extract tables
for element in elements:
if element.category == "Table":
paper['tables'].append({
'page': element.metadata.page_number,
'content': element.text,
'html': element.metadata.text_as_html if hasattr(element.metadata, 'text_as_html') else None
})
# Chunk into sections
chunks = chunk_by_title(elements, max_characters=2000)
current_section = None
for chunk in chunks:
if chunk.category == "Title":
paper['sections'].append({
'title': chunk.text,
'content': ''
})
elif paper['sections']:
paper['sections'][-1]['content'] += chunk.text + '\n'
return paper
paper = extract_paper('research_paper.pdf')
print(f"Title: {paper['title']}")
print(f"Tables: {len(paper['tables'])}")
print(f"Sections: {len(paper['sections'])}")
Example 2: Invoice Data Extraction
from unstructured.partition.auto import partition
import re
def extract_invoice_data(file_path):
"""Extract key data from invoice."""
elements = partition(file_path, strategy="hi_res")
# Combine all text
full_text = '\n'.join([e.text for e in elements])
invoice = {
'invoice_number': None,
'date': None,
'total': None,
'vendor': None,
'line_items': [],
'tables': []
}
# Extract patterns
inv_match = re.search(r'Invoice\s*#?\s*:?\s*(\w+[-\w]*)', full_text, re.I)
if inv_match:
invoice['invoice_number'] = inv_match.group(1)
date_match = re.search(r'Date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.I)
if date_match:
invoice['date'] = date_match.group(1)
total_match = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', full_text, re.I)
if total_match:
invoice['total'] = float(total_match.group(1).replace(',', ''))
# Extract tables
for element in elements:
if element.category == "Table":
invoice['tables'].append(element.text)
return invoice
invoice = extract_invoice_data('invoice.pdf')
print(f"Invoice #: {invoice['invoice_number']}")
print(f"Total: ${invoice['total']}")
Example 3: Document Corpus Builder
Limitations
- Complex layouts may need manual review
- OCR quality depends on image quality
- Large files may need chunking
- Some proprietary formats not supported
- API rate limits for cloud processing
Installation
# Basic installation
pip install unstructured
# With all dependencies
pip install "unstructured[all-docs]"
# For PDF processing
pip install "unstructured[pdf]"
# For specific formats
pip install "unstructured[docx,pptx,xlsx]"