"""Extract text from local documents (PDF, EPUB, TXT, MD), deduplicate them by
normalized content hash, and write a plain-text corpus plus a JSON manifest."""

from __future__ import annotations

import argparse
import hashlib
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable


@dataclass(frozen=True)
class ExtractedDoc:
    source_path: str
    text: str


def _normalize_for_hash(text: str) -> str:
    """Normalize text so trivial formatting differences do not defeat dedup."""
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _sha256_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()


def _extract_pdf(path: Path) -> str:
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    parts: list[str] = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            parts.append("")  # keep going if a single page fails to extract
    return "\n".join(parts)


def _extract_epub(path: Path) -> str:
    from bs4 import BeautifulSoup
    from ebooklib import ITEM_DOCUMENT, epub

    book = epub.read_epub(str(path))
    parts: list[str] = []
    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        parts.append(soup.get_text("\n", strip=True))
    return "\n".join(parts)


def _read_text_file(path: Path) -> str:
    import chardet

    raw = path.read_bytes()
    guess = chardet.detect(raw)
    encoding = guess.get("encoding") or "utf-8"
    try:
        return raw.decode(encoding, errors="replace")
    except LookupError:
        # chardet guessed a codec Python does not know; fall back to UTF-8.
        return raw.decode("utf-8", errors="replace")


def extract_text(path: Path) -> ExtractedDoc | None:
    """Dispatch on file extension; return None if extraction fails or is unsupported."""
    suffix = path.suffix.lower()
    try:
        if suffix == ".pdf":
            return ExtractedDoc(str(path), _extract_pdf(path))
        if suffix == ".epub":
            return ExtractedDoc(str(path), _extract_epub(path))
        if suffix in {".txt", ".md"}:
            return ExtractedDoc(str(path), _read_text_file(path))
    except Exception:
        return None
    return None


def iter_candidate_files(root: Path) -> Iterable[Path]:
    exts = {".pdf", ".epub", ".txt", ".md"}
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        if path.suffix.lower() not in exts:
            continue
        yield path


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Extract and dedupe local documents into a plain-text corpus."
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=Path("eBooks"),
        help="Input directory to scan (default: eBooks).",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=Path("training_data"),
        help="Output directory (default: training_data).",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=2000,
        help="Skip extracted docs shorter than this (default: 2000).",
    )
    args = parser.parse_args()

    in_dir: Path = args.input
    out_dir: Path = args.out
    out_text_dir = out_dir / "text"
    out_text_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_dir / "manifest.json"
    corpus_path = out_dir / "corpus.txt"
    rejected_path = out_dir / "rejected.json"

    docs: dict[str, dict] = {}
    rejected: list[dict] = []
    seen_hashes: set[str] = set()

    # Extract each candidate file, then dedupe on the hash of its normalized text.
    candidates = sorted(iter_candidate_files(in_dir))
    for file_path in candidates:
        extracted = extract_text(file_path)
        if extracted is None:
            rejected.append({"path": str(file_path), "reason": "extract_failed"})
            continue

        normalized = _normalize_for_hash(extracted.text)
        if len(normalized) < args.min_chars:
            rejected.append({"path": str(file_path), "reason": "too_short"})
            continue

        doc_hash = _sha256_text(normalized)
        if doc_hash in seen_hashes:
            # Exact duplicate of an already-seen document; record it on the original.
            docs[doc_hash]["duplicates"].append(str(file_path))
            continue

        seen_hashes.add(doc_hash)
        out_txt = out_text_dir / f"{doc_hash}.txt"
        out_txt.write_text(extracted.text, encoding="utf-8", errors="ignore")
        docs[doc_hash] = {
            "id": doc_hash,
            "primary": str(file_path),
            "duplicates": [],
            "chars": len(extracted.text),
        }

    manifest_path.write_text(json.dumps({"docs": list(docs.values())}, indent=2), encoding="utf-8")
    rejected_path.write_text(json.dumps(rejected, indent=2), encoding="utf-8")

    # Build concatenated corpus
    with corpus_path.open("w", encoding="utf-8", errors="ignore") as f:
        for doc in docs.values():
            f.write("\n\n" + "=" * 80 + "\n")
            f.write(f"SOURCE: {doc['primary']}\n")
            f.write("=" * 80 + "\n\n")
            f.write((out_text_dir / f"{doc['id']}.txt").read_text(encoding="utf-8", errors="ignore"))
            f.write("\n")

    print(f"Extracted unique docs: {len(docs)}")
    print(f"Wrote corpus: {corpus_path}")
    print(f"Manifest: {manifest_path}")
    if rejected:
        print(f"Rejected: {len(rejected)} (see {rejected_path})")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())