
Layout Normalization
Normalize document layouts and formatting for consistent processing across different sources.
Layout Normalization
Different sources produce documents with different layouts. Normalizing them into one consistent form lets every later processing step handle all sources the same way.
Text Normalization
def normalize_text(text):
    """Normalize Unicode, line breaks, whitespace and bullets in ``text``.

    Steps, in order:

    1. Apply Unicode NFKD normalization.
    2. Convert CRLF and bare CR line endings to LF.
    3. Collapse each run of whitespace inside a line to one space and trim
       every line; leading and trailing blank lines are dropped.
    4. Replace the bullet characters ``•`` and ``·`` with ``*``.

    Line endings are normalized *before* whitespace is collapsed.
    Collapsing first (``' '.join(text.split())`` on the whole string) turns
    every line break into a space, which makes the line-break step a no-op
    and flattens the document onto a single line.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string, with line breaks preserved as LF characters.
    """
    import unicodedata

    text = unicodedata.normalize('NFKD', text)

    # Unify line endings first so the per-line pass below sees only LF.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Collapse whitespace within each line, keeping the line structure.
    lines = [' '.join(line.split()) for line in text.split('\n')]
    text = '\n'.join(lines).strip('\n')

    # Map common bullet glyphs to a plain asterisk.
    text = text.replace('•', '*')
    text = text.replace('·', '*')
    return text
Multi-Column Layout
def detect_columns(page):
    """Return the text of ``page``, merging columns when several are found.

    Args:
        page: A page object exposing ``get_text()``; it is also passed to
            ``has_multiple_columns`` and ``merge_columns``.

    Returns:
        The result of ``merge_columns(page)`` when ``has_multiple_columns``
        reports a multi-column layout, otherwise ``page.get_text()``.
    """
    if not has_multiple_columns(page):
        # Single-column page: the raw text needs no merging.
        return page.get_text()

    # Multi-column page: delegate to merge_columns.
    # NOTE(review): presumably reads left-to-right, top-to-bottom — confirm
    # against merge_columns, which is defined outside this section.
    return merge_columns(page)
Table Preservation
def preserve_table_structure(table):
    """Render ``table`` as a Markdown table.

    The first row is used as the header; every following row becomes a body
    row. Cells are joined with ``' | '`` and the delimiter row contains one
    ``---`` per header cell.

    Cells are made safe for a Markdown row before joining:

    * ``None`` becomes an empty cell.
    * Non-string values are converted with ``str()``.
    * Line breaks inside a cell are replaced by a space, because a Markdown
      table row must stay on one line.
    * A literal ``|`` is escaped as ``\\|`` so it is not read as a column
      separator.

    Args:
        table: A sequence of rows, each a sequence of cell values.

    Returns:
        The Markdown table as a string, or ``''`` when ``table`` is empty.
    """
    if not table:
        # Nothing to render; avoids an IndexError on table[0].
        return ''

    def _cell(value):
        # Turn one cell value into text that is safe inside a Markdown row.
        if value is None:
            return ''
        flat = ' '.join(str(value).splitlines())
        return flat.replace('|', '\\|')

    def _row(cells):
        # Join the cleaned cells of one row.
        return ' | '.join(_cell(c) for c in cells)

    headers = _row(table[0])
    separator = '|'.join(['---'] * len(table[0]))
    rows = [_row(row) for row in table[1:]]
    return f"{headers}\n{separator}\n" + '\n'.join(rows)
Next: Language detection.