epstein-docs.github.io/deduplicate_types.py
#!/usr/bin/env python3
"""
Document type deduplication script using LLM to merge similar types.
Groups document type variations (e.g., "Deposition", "deposition", "Deposition Transcript")
into canonical types.
"""
import os
import json
import re
from pathlib import Path
from typing import Dict, List
from collections import Counter
from openai import OpenAI
from dotenv import load_dotenv


class DocumentTypeDeduplicator:
    """Deduplicate document types using LLM"""

    def __init__(self, api_url: str, api_key: str, model: str = "gpt-4o"):
        self.client = OpenAI(api_key=api_key, base_url=api_url)
        self.model = model
        self.results_dir = Path("./results")
        self.output_file = Path("./dedupe_types.json")

    def collect_document_types(self) -> Counter:
        """Collect all document types from JSON files"""
        types = []
        for json_file in self.results_dir.glob("**/*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                doc_type = data.get('document_metadata', {}).get('document_type')
                if doc_type:
                    types.append(str(doc_type).strip())
            except Exception as e:
                print(f"Warning: Could not read {json_file}: {e}")
        return Counter(types)
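
    # The per-document JSON files in ./results are assumed to look roughly
    # like the sketch below; only document_metadata.document_type is read,
    # everything else is ignored:
    #
    #   {
    #     "document_metadata": {
    #       "document_type": "Deposition Transcript"
    #     }
    #   }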

    def _deduplicate_in_batches(self, unique_types: List[str], type_counts: Counter) -> Dict[str, str]:
        """Deduplicate types in batches to handle large numbers"""
        batch_size = 100
        all_mappings = {}
        canonical_to_variants = {}

        # First pass: deduplicate in batches
        for i in range(0, len(unique_types), batch_size):
            batch = unique_types[i:i+batch_size]
            print(f" Processing batch {i//batch_size + 1}/{(len(unique_types) + batch_size - 1)//batch_size} ({len(batch)} types)...")
            try:
                batch_mappings = self._deduplicate_single_batch(batch)
                # Collect mappings and track canonical types
                for original, canonical in batch_mappings.items():
                    all_mappings[original] = canonical
                    if canonical not in canonical_to_variants:
                        canonical_to_variants[canonical] = []
                    canonical_to_variants[canonical].append(original)
            except Exception as e:
                print(f" Warning: Failed to process batch, using original names: {e}")
                for t in batch:
                    all_mappings[t] = t
                    if t not in canonical_to_variants:
                        canonical_to_variants[t] = []
                    canonical_to_variants[t].append(t)

        # Second pass: deduplicate the canonical types themselves
        # (in case different batches created similar canonical types)
        print(f"\n📋 Batch processing created {len(canonical_to_variants)} unique canonical types")
        print("Running final deduplication pass to merge any duplicates across batches...")
        try:
            canonical_types = list(canonical_to_variants.keys())
            canonical_mappings = self._deduplicate_final_pass(canonical_types)
            # Apply final canonical deduplication
            for original, first_canonical in all_mappings.items():
                final_canonical = canonical_mappings.get(first_canonical, first_canonical)
                all_mappings[original] = final_canonical
            # Count final canonicals
            final_canonicals = set(all_mappings.values())
            print(f"✅ Final deduplication reduced {len(canonical_to_variants)} → {len(final_canonicals)} canonical types")
        except Exception as e:
            print(f" Warning: Failed to deduplicate canonical types: {e}")
        return all_mappings
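
    # Illustrative two-pass flow (values made up for the example): batch 1
    # might map "deposition" -> "Deposition" while batch 2 maps
    # "Deposition Transcript" -> "Deposition Transcript"; the final pass then
    # collapses "Deposition Transcript" -> "Deposition" so the batches agree.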

    def _deduplicate_final_pass(self, canonical_types: List[str]) -> Dict[str, str]:
        """Final deduplication pass for canonical types from different batches"""
        if len(canonical_types) <= 1:
            return {t: t for t in canonical_types}

        prompt = f"""You are a legal document classifier performing a FINAL CLEANUP pass on canonical document types.

Your task: Merge any remaining duplicate or very similar canonical types.

⚠️⚠️⚠️ CRITICAL RULES ⚠️⚠️⚠️

1. These are ALREADY canonical types, so be conservative
2. ONLY merge if types are truly the same thing with different names:
   - "Deposition" and "Deposition Transcript" → "Deposition"
   - "Court Filing" and "Court Document" → "Court Filing"
   - "Email" and "E-mail" → "Email"
3. DO NOT merge types that are legitimately different:
   - "Letter" and "Email" are DIFFERENT (keep separate)
   - "Affidavit" and "Declaration" are DIFFERENT (keep separate)
   - "Motion" and "Memorandum" are DIFFERENT (keep separate)
4. Prefer the SHORTER, simpler canonical name when merging
5. Use these standard canonical types when possible:
   - Deposition
   - Court Filing
   - Letter
   - Email
   - Affidavit
   - Motion
   - Subpoena
   - Flight Log
   - Financial Record
   - Contract
   - Memorandum
   - Transcript
   - Exhibit
   - Declaration
   - Report

Here are the canonical types to review (sorted alphabetically):
{json.dumps(sorted(canonical_types), indent=2)}

Return ONLY valid JSON mapping each type to its final canonical form:
{{
  "Type 1": "Final Canonical Type",
  "Type 2": "Final Canonical Type",
  ...
}}

If a type is already perfect, map it to itself."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=4000
        )
        content = response.choices[0].message.content.strip()

        # Extract JSON
        json_match = re.search(r'```(?:json)?\s*\n(.*?)\n```', content, re.DOTALL)
        if json_match:
            content = json_match.group(1).strip()
        else:
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if json_match:
                content = json_match.group(0).strip()
            else:
                # Brace-counting fallback
                start = content.find('{')
                if start >= 0:
                    brace_count = 0
                    for i in range(start, len(content)):
                        if content[i] == '{':
                            brace_count += 1
                        elif content[i] == '}':
                            brace_count -= 1
                            if brace_count == 0:
                                content = content[start:i+1]
                                break

        try:
            mappings = json.loads(content)
        except json.JSONDecodeError:
            print("Failed to parse JSON response in final pass. First 500 chars:")
            print(content[:500])
            raise

        # Validate mappings
        validated_mappings = {}
        for original, canonical in mappings.items():
            canonical = str(canonical).strip()
            if not canonical:
                canonical = original
            validated_mappings[original] = canonical
        return validated_mappings

    def _deduplicate_single_batch(self, types: List[str]) -> Dict[str, str]:
        """Deduplicate a single batch of types"""
        prompt = f"""You are a legal document classifier. Your task is to group similar document type labels into standardized canonical types.

⚠️⚠️⚠️ CRITICAL RULES ⚠️⚠️⚠️

1. The canonical type MUST be a clean, professional document type name
2. Use title case (e.g., "Deposition", "Court Filing", "Email")
3. Merge variations that mean the same thing:
   - "deposition" → "Deposition"
   - "DEPOSITION" → "Deposition"
   - "deposition transcript" → "Deposition"
   - "dep" → "Deposition"
4. Common canonical types to use:
   - Deposition
   - Court Filing
   - Letter
   - Email
   - Affidavit
   - Motion
   - Subpoena
   - Flight Log
   - Financial Record
   - Contract
   - Memorandum
   - Transcript
   - Exhibit
   - Declaration
   - Report
   - Unknown (only if truly unidentifiable)
5. Be generous with merging - if types are similar, merge them
6. Prefer shorter, cleaner canonical names

Here are the document types to deduplicate:
{json.dumps(types, indent=2)}

Return ONLY valid JSON in this exact format:
{{
  "document_type_1": "Canonical Type",
  "document_type_2": "Canonical Type",
  ...
}}

Map every input type to its canonical form. If a type is already clean, map it to itself."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=4000
        )
        content = response.choices[0].message.content.strip()

        # Extract JSON
        json_match = re.search(r'```(?:json)?\s*\n(.*?)\n```', content, re.DOTALL)
        if json_match:
            content = json_match.group(1).strip()
        else:
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if json_match:
                content = json_match.group(0).strip()
            else:
                # Brace-counting fallback
                start = content.find('{')
                if start >= 0:
                    brace_count = 0
                    for i in range(start, len(content)):
                        if content[i] == '{':
                            brace_count += 1
                        elif content[i] == '}':
                            brace_count -= 1
                            if brace_count == 0:
                                content = content[start:i+1]
                                break

        try:
            mappings = json.loads(content)
        except json.JSONDecodeError:
            print("Failed to parse JSON response. First 500 chars:")
            print(content[:500])
            raise

        # Validate and clean up mappings
        validated_mappings = {}
        for original, canonical in mappings.items():
            canonical = str(canonical).strip()
            if not canonical:
                canonical = "Unknown"
            validated_mappings[original] = canonical
        return validated_mappings
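
    # A successful batch call yields a flat mapping, e.g. (hypothetical values,
    # actual groupings depend on the model):
    #
    #   {"deposition": "Deposition", "DEPOSITION": "Deposition",
    #    "dep": "Deposition", "Court Filing": "Court Filing"}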

    def deduplicate_types(self, type_counts: Counter) -> Dict[str, str]:
        """Use LLM to deduplicate document types"""
        # Get unique types sorted by frequency
        unique_types = sorted(type_counts.keys(), key=lambda x: type_counts[x], reverse=True)
        print(f"Found {len(unique_types)} unique document types")

        # If too many types, process in batches
        if len(unique_types) > 100:
            print("Processing in batches (too many types for single request)...")
            return self._deduplicate_in_batches(unique_types, type_counts)

        print("Processing single batch deduplication...")
        mappings = self._deduplicate_single_batch(unique_types)

        # Get canonical types
        canonical_types = list(set(mappings.values()))
        print(f"\n📋 Initial deduplication created {len(canonical_types)} canonical types")

        # Do a final review pass
        if len(canonical_types) > 1:
            print("Running final review pass for cleanup...")
            try:
                final_mappings = self._deduplicate_final_pass(canonical_types)
                # Apply final pass
                for original, first_canonical in mappings.items():
                    final_canonical = final_mappings.get(first_canonical, first_canonical)
                    mappings[original] = final_canonical
                final_canonicals = set(mappings.values())
                print(f"✅ Final review reduced {len(canonical_types)} → {len(final_canonicals)} canonical types")
            except Exception as e:
                print(f" Warning: Final review failed: {e}")
        return mappings

    def save_mappings(self, mappings: Dict[str, str], type_counts: Counter):
        """Save deduplication mappings to JSON file"""
        # Get stats
        canonical_types = set(mappings.values())
        total_docs = sum(type_counts.values())
        output = {
            "stats": {
                "original_types": len(mappings),
                "canonical_types": len(canonical_types),
                "total_documents": total_docs,
                "reduction_percentage": round((1 - len(canonical_types) / len(mappings)) * 100, 1)
            },
            "mappings": mappings
        }
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        print(f"\n✅ Saved type mappings to {self.output_file}")
        print(f" Original types: {len(mappings)}")
        print(f" Canonical types: {len(canonical_types)}")
        print(f" Reduction: {output['stats']['reduction_percentage']}%")

        # Show canonical type breakdown
        canonical_counts = Counter()
        for original, canonical in mappings.items():
            canonical_counts[canonical] += type_counts[original]
        print("\n📊 Top canonical types:")
        for canonical, count in canonical_counts.most_common(10):
            print(f" {canonical}: {count} documents")
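
    # dedupe_types.json ends up shaped like this (keys mirror the output dict
    # above; values are placeholders):
    #
    #   {
    #     "stats": {
    #       "original_types": ...,
    #       "canonical_types": ...,
    #       "total_documents": ...,
    #       "reduction_percentage": ...
    #     },
    #     "mappings": {"<original type>": "<canonical type>"}
    #   }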


def main():
    load_dotenv()

    import argparse
    parser = argparse.ArgumentParser(description="Deduplicate document types using LLM")
    parser.add_argument("--api-url", help="OpenAI-compatible API base URL")
    parser.add_argument("--api-key", help="API key")
    parser.add_argument("--model", help="Model name")
    args = parser.parse_args()

    api_url = args.api_url or os.getenv("OPENAI_API_URL")
    api_key = args.api_key or os.getenv("OPENAI_API_KEY")
    model = args.model or os.getenv("OPENAI_MODEL", "gpt-4o")
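
    # A minimal .env for local runs might look like this (placeholder values,
    # pointing at any OpenAI-compatible endpoint):
    #
    #   OPENAI_API_URL=https://api.openai.com/v1
    #   OPENAI_API_KEY=sk-...
    #   OPENAI_MODEL=gpt-4o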

    if not api_url or not api_key:
        print("Error: API URL and API key are required")
        print("Set OPENAI_API_URL and OPENAI_API_KEY in .env or pass via --api-url and --api-key")
        return 1

    print("=" * 60)
    print("DOCUMENT TYPE DEDUPLICATION")
    print("=" * 60)
    deduplicator = DocumentTypeDeduplicator(api_url, api_key, model)

    # Collect all document types
    type_counts = deduplicator.collect_document_types()
    if not type_counts:
        print("No document types found in results directory")
        return 1

    # Deduplicate using LLM
    mappings = deduplicator.deduplicate_types(type_counts)

    # Save results
    deduplicator.save_mappings(mappings, type_counts)
    print("\n✅ Done! Update .eleventy.js to load dedupe_types.json")


if __name__ == "__main__":
    exit(main() or 0)