You will build a pipeline that takes scanned or PDF invoices, extracts structured data (vendor, line items, totals, VAT, payment terms, bank details), and outputs accounting-system-ready records. A mid-size company processing 2,400 invoices per month reduces data entry time from 8 minutes per invoice to 15 seconds of human verification. The full pipeline runs on a single dedicated GPU server.
Pipeline Architecture
| Stage | Tool | Output |
|---|---|---|
| 1. Document OCR | PaddleOCR + layout | Text with spatial data |
| 2. Table detection | Table Transformer | Line item tables |
| 3. Field extraction | LLaMA 3.1 8B | Structured invoice data |
| 4. Validation | Business rules | Verified records |
Invoice OCR with Layout Analysis
import json

import numpy as np
import torch
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
# Global OCR engine, loaded once at import time (model load is expensive).
# Angle classification handles rotated/upside-down scans; det_db_box_thresh
# is lowered from the library default so faint text on poor-quality scans
# still produces detection boxes.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True,
                det_db_box_thresh=0.3)  # Lower threshold for faint text
def extract_invoice_text(file_path: str) -> dict:
    """Run OCR over a PDF or image invoice and return positioned text blocks.

    Args:
        file_path: Path to a ``.pdf`` (rendered at 300 DPI) or an image file.

    Returns:
        dict with:
            "blocks": list of dicts (text, confidence, bbox, page, y_center,
                x_center) for every OCR line with confidence > 0.6
            "pages": number of pages processed
    """
    if file_path.endswith('.pdf'):
        images = convert_from_path(file_path, dpi=300)
    else:
        images = [Image.open(file_path)]
    all_blocks = []
    for page_num, img in enumerate(images):
        result = ocr.ocr(np.array(img), cls=True)
        # PaddleOCR returns [None] for a page with no detected text;
        # guard before iterating or blank pages raise a TypeError.
        if not result or result[0] is None:
            continue
        for line in result[0]:
            bbox, (text, conf) = line[0], line[1]
            # Drop low-confidence fragments; 0.6 trades recall for less noise.
            if conf > 0.6:
                all_blocks.append({
                    "text": text, "confidence": conf,
                    "bbox": bbox, "page": page_num,
                    # Centroid from the top-left (bbox[0]) and bottom-right
                    # (bbox[2]) corners -- used downstream to separate
                    # headers, line items, and footers by position.
                    "y_center": (bbox[0][1] + bbox[2][1]) / 2,
                    "x_center": (bbox[0][0] + bbox[2][0]) / 2
                })
    return {"blocks": all_blocks, "pages": len(images)}
PaddleOCR preserves bounding box coordinates for each text block. This spatial information is critical for distinguishing line items from headers and footers in invoice layouts.
Table Detection and Extraction
from transformers import TableTransformerForDetection, DetrImageProcessor
from PIL import Image
# NOTE(review): processor is built with library-default preprocessing rather
# than DetrImageProcessor.from_pretrained("microsoft/table-transformer-detection");
# confirm the default resize/normalization matches what the checkpoint expects.
processor = DetrImageProcessor()
# Detection checkpoint: locates table *regions* on a page (not cell
# structure); moved to GPU once at import time.
table_model = TableTransformerForDetection.from_pretrained(
    "microsoft/table-transformer-detection"
).to("cuda")
def detect_tables(image: Image.Image) -> list:
    """Locate table regions in a page image using Table Transformer.

    Args:
        image: PIL image of a single invoice page.

    Returns:
        List of dicts ``{"bbox": [x1, y1, x2, y2], "confidence": float}``
        for each detection scoring above 0.7, in pixel coordinates.
    """
    inputs = processor(images=image, return_tensors="pt").to("cuda")
    # Inference only: disable autograd to avoid building a graph and
    # holding activations in GPU memory.
    with torch.no_grad():
        outputs = table_model(**inputs)
    # DETR post-processing expects (height, width); PIL .size is (w, h).
    target_sizes = torch.tensor([image.size[::-1]]).to("cuda")
    results = processor.post_process_object_detection(
        outputs, threshold=0.7, target_sizes=target_sizes
    )[0]
    tables = []
    for score, box in zip(results["scores"], results["boxes"]):
        x1, y1, x2, y2 = box.tolist()
        tables.append({"bbox": [x1, y1, x2, y2],
                       "confidence": score.item()})
    return tables
LLM Field Extraction
from openai import OpenAI
# OpenAI-compatible client pointed at a local vLLM server; the api_key value
# is unused by vLLM but required by the client constructor.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
def extract_invoice_fields(text_blocks: list, table_text: str) -> dict:
    """Extract structured invoice fields from OCR text via the local LLM.

    Args:
        text_blocks: OCR blocks from extract_invoice_text()["blocks"];
            only the "text" key of each block is used.
        table_text: Flattened text of detected line-item tables.

    Returns:
        Parsed invoice dict matching the schema in the system prompt.

    Raises:
        ValueError: If the model response contains no JSON object.
        json.JSONDecodeError: If the extracted object is malformed.
    """
    full_text = "\n".join([b["text"] for b in text_blocks])
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[{
            "role": "system",
            "content": """Extract invoice data. Return JSON:
{"vendor": {"name": "", "address": "", "vat_number": ""},
"invoice_number": "", "invoice_date": "", "due_date": "",
"payment_terms": "", "currency": "",
"line_items": [{"description": "", "quantity": 0,
"unit_price": 0.0, "vat_rate": 0.0, "line_total": 0.0}],
"subtotal": 0.0, "vat_total": 0.0, "total": 0.0,
"bank_details": {"sort_code": "", "account": "", "iban": ""},
"purchase_order": ""}"""
        }, {"role": "user", "content": f"Invoice text:\n{full_text}\n\nTable data:\n{table_text}"}],
        max_tokens=1500, temperature=0.0
    )
    raw = response.choices[0].message.content
    # The original called parse_json(), which is not defined in this file.
    # Parse here instead: models occasionally wrap the JSON in ```fences```
    # or add prose, so extract the outermost {...} before json.loads.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end <= start:
        raise ValueError(f"No JSON object in LLM response: {raw[:200]!r}")
    return json.loads(raw[start:end + 1])
The vLLM server handles field extraction with zero temperature for deterministic output. Providing both raw text and detected table data helps the LLM correctly identify line items versus header information.
Business Rule Validation
def validate_invoice(data: dict) -> dict:
    """Cross-check extracted invoice arithmetic and attach a validation report.

    Verifies, with small tolerances for rounding:
      * each line: quantity * unit_price == line_total (within 0.01)
      * sum of line totals == subtotal (within 0.05)
      * sum of per-line VAT == vat_total (within 0.10)

    Missing numeric fields default to 0 rather than raising, so a partial
    LLM extraction produces validation errors instead of a crash.

    Args:
        data: Extracted invoice dict (schema from extract_invoice_fields).

    Returns:
        The same dict, mutated in place with
        data["validation"] = {"passed": bool, "errors": [str, ...]}.
    """
    errors = []
    items = data.get("line_items", [])
    # Per-line arithmetic: quantity * unit_price must match the stated total.
    for i, item in enumerate(items):
        qty = item.get("quantity", 0)
        unit_price = item.get("unit_price", 0.0)
        stated_line = item.get("line_total", 0.0)
        expected = round(qty * unit_price, 2)
        if abs(expected - stated_line) > 0.01:
            errors.append(f"Line {i+1}: calculated {expected}, extracted {stated_line}")
    # Subtotal vs sum of line totals; looser tolerance because the stated
    # subtotal may itself be rounded.
    line_sum = sum(item.get("line_total", 0.0) for item in items)
    subtotal = data.get("subtotal", 0)
    if abs(line_sum - subtotal) > 0.05:
        # Use the defaulted value in the message too -- the original read
        # data['subtotal'] directly and crashed exactly when the key was
        # missing and the error should have been reported.
        errors.append(f"Subtotal mismatch: lines={line_sum}, stated={subtotal}")
    # VAT recomputed per line; 0.10 tolerance absorbs per-line rounding.
    expected_vat = sum(item.get("line_total", 0.0) * item.get("vat_rate", 0.0) / 100
                       for item in items)
    vat_total = data.get("vat_total", 0)
    if abs(expected_vat - vat_total) > 0.10:
        errors.append(f"VAT mismatch: calculated={expected_vat:.2f}, stated={vat_total}")
    data["validation"] = {"passed": len(errors) == 0, "errors": errors}
    return data
Integration and Production
Export validated invoice data to your accounting system via API (Xero, QuickBooks, Sage) or CSV, and flag invoices with validation errors for manual review. A 24 GB GPU such as the RTX 4090 processes 200+ invoices per hour, handling the monthly volume of 2,400 invoices in a single batch run. Deploy on private infrastructure to keep financial documents confidential. For related guides, see document AI hosting (OCR setup), chatbot hosting (invoice query interfaces), and our GDPR compliance notes, tutorials, and use cases.
Invoice Processing GPU Servers
Process thousands of invoices per month with OCR and LLM extraction on dedicated UK GPU infrastructure.
Browse GPU Servers