from __future__ import annotations

import argparse
import csv
import hashlib
import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

_OPTIONS_KEYWORDS: dict[str, float] = {
    # core
    "option": 2.0,
    "options": 2.0,
    "call": 1.0,
    "put": 1.0,
    "strike": 2.0,
    "expiration": 2.0,
    "expiry": 2.0,
    "premium": 2.0,
    "contract": 1.0,
    "underlying": 2.0,
    "open interest": 3.0,
    "bid-ask": 3.0,
    "bid ask": 3.0,
    "assignment": 3.0,
    "exercise": 2.0,
    "early exercise": 4.0,
    # greeks / vol
    "delta": 3.0,
    "gamma": 3.0,
    "theta": 3.0,
    "vega": 3.0,
    "rho": 2.0,
    "implied volatility": 4.0,
    "historical volatility": 3.0,
    "volatility smile": 3.0,
    "skew": 2.0,
    "iv": 1.5,
    # strategies
    "spread": 2.0,
    "vertical spread": 4.0,
    "calendar spread": 4.0,
    "diagonal spread": 4.0,
    "credit spread": 4.0,
    "debit spread": 4.0,
    "iron condor": 5.0,
    "butterfly": 3.0,
    "straddle": 4.0,
    "strangle": 4.0,
    "covered call": 5.0,
    "protective put": 5.0,
    "cash-secured put": 5.0,
    "ratio spread": 4.0,
    # risk / pricing
    "intrinsic value": 4.0,
    "time value": 4.0,
    "extrinsic value": 4.0,
    "breakeven": 3.0,
    "probability of profit": 4.0,
    "expected value": 3.0,
    "black-scholes": 5.0,
    "black scholes": 5.0,
    "binomial": 3.0,
    "greeks": 4.0,
    "margin": 2.0,
    "reg t": 2.0,
    "portfolio margin": 4.0,
}

_JUNK_PHRASES = [
    "all rights reserved",
    "no part of this publication",
    "printed in",
    "publisher",
    "isbn",
    "library of congress",
    "copyright",
    "acknowledg",
    "about the author",
    "disclaimer",
    "warranty",
]


@dataclass(frozen=True)
class Segment:
    source_path: str
    locator: str  # "page:123" or "section:foo"
    text: str
    score: float


def _fix_drop_caps(text: str) -> str:
    # Join single-letter drop caps like "O ptions" -> "Options".
    for _ in range(6):
        fixed = re.sub(r"\b([A-Za-z])\s+(?=[a-z])", r"\1", text)
        if fixed == text:
            break
        text = fixed
    return text


def _normalize(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = _fix_drop_caps(text)
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _sha256(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()


def _tokenish(text: str) -> list[str]:
    return re.findall(r"[a-z]{2,}", text.lower())


def _keyword_score(text: str) -> tuple[float, dict[str, int]]:
    t = " " + _normalize(text) + " "
    hits: dict[str, int] = {}
    score = 0.0
    for kw, weight in _OPTIONS_KEYWORDS.items():
        if " " in kw:
            n = t.count(" " + kw + " ")
        else:
            n = len(re.findall(rf"\b{re.escape(kw)}\b", t))
        if n:
            hits[kw] = n
            score += weight * n
    return score, hits

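# Illustrative example (comment only, not executed): scoring a short snippet with
# _keyword_score. Single-word keywords are matched on \b word boundaries; multi-word
# keywords are counted as space-delimited substrings of the normalized text, so an
# occurrence followed immediately by punctuation is not counted. The values below
# assume the weights in _OPTIONS_KEYWORDS above.
#
#   score, hits = _keyword_score("The covered call caps upside; theta decay helps.")
#   # hits  -> {"call": 1, "theta": 1, "covered call": 1}
#   # score -> 9.0  (1.0 "call" + 3.0 "theta" + 5.0 "covered call")
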
def _looks_like_toc(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if "table of contents" in head or re.search(r"\bcontents\b", head):
        return True
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 10:
        return False
    # Many lines ending with digits and/or dotted leaders
    end_num = sum(1 for ln in lines if re.search(r"(\.{2,}|\s)\d{1,4}$", ln))
    dotted = sum(1 for ln in lines if "..." in ln or re.search(r"\.{4,}", ln))
    shortish = sum(1 for ln in lines if len(ln) <= 60)
    if (
        end_num / len(lines) >= 0.35
        and (dotted + end_num) / len(lines) >= 0.35
        and shortish / len(lines) >= 0.5
    ):
        return True
    return False


def _looks_like_index(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if re.search(r"^\s*index\b", head):
        return True
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 15:
        return False
    indexish = 0
    for ln in lines[:200]:
        if re.search(r"\b\d{1,4}(?:,\s*\d{1,4}){2,}\b", ln):
            indexish += 1
            continue
        if re.search(r"^[a-z].{1,60}\s+\d{1,4}(?:,\s*\d{1,4})+\b", ln):
            indexish += 1
            continue
    return indexish >= max(10, math.ceil(0.25 * min(len(lines), 200)))


def _looks_like_front_matter(text: str) -> bool:
    t = _normalize(text)
    head = t[:800]
    if any(p in head for p in _JUNK_PHRASES):
        return True
    # Too little prose
    toks = _tokenish(head)
    if len(toks) < 80 and (("isbn" in head) or ("copyright" in head)):
        return True
    return False


def _is_junk(text: str) -> str | None:
    if _looks_like_toc(text):
        return "toc"
    if _looks_like_index(text):
        return "index"
    if _looks_like_front_matter(text):
        return "front_matter"
    return None


def _iter_files(root: Path, include: list[str], exclude: list[str]) -> Iterable[Path]:
    exts = {".pdf", ".epub"}
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            path_lower = str(p).lower()
            if include and not any(token in path_lower for token in include):
                continue
            if exclude and any(token in path_lower for token in exclude):
                continue
            yield p


def _extract_pdf_segments(path: Path) -> list[tuple[str, str]]:
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    out: list[tuple[str, str]] = []
    for i, page in enumerate(reader.pages, start=1):
        try:
            txt = page.extract_text() or ""
        except Exception:
            txt = ""
        out.append((f"page:{i}", txt))
    return out


def _extract_epub_segments(path: Path) -> list[tuple[str, str]]:
    from bs4 import BeautifulSoup
    from ebooklib import ITEM_DOCUMENT, epub

    book = epub.read_epub(str(path))
    out: list[tuple[str, str]] = []
    idx = 0
    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue
        idx += 1
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        txt = soup.get_text("\n", strip=True)
        name = getattr(item, "file_name", None) or f"doc:{idx}"
        out.append((f"section:{name}", txt))
    return out

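# Example invocation (illustrative only; the script filename and the filter
# substrings below are placeholders, while the flag names match the argparse
# definitions in main()):
#
#   python select_relevant.py --input eBooks --out training_data/relevant \
#       --min-score 12 --neighbors 1 --include options --exclude workbook
#
# --include and --exclude are case-insensitive substring filters on the full
# file path and may be repeated. Outputs are written under --out: per-segment
# text files in text/, plus manifest.json, report.csv, and corpus.txt.
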
def main() -> int:
    parser = argparse.ArgumentParser(
        description="Select option-trading-relevant pages/sections from PDFs/EPUBs."
    )
    parser.add_argument("--input", type=Path, default=Path("eBooks"))
    parser.add_argument("--out", type=Path, default=Path("training_data/relevant"))
    parser.add_argument("--min-score", type=float, default=10.0)
    parser.add_argument("--front-matter-score", type=float, default=None)
    parser.add_argument("--min-chars", type=int, default=800)
    parser.add_argument(
        "--neighbors",
        type=int,
        default=1,
        help="Include +/- N neighbor pages/sections around hits.",
    )
    parser.add_argument(
        "--include",
        action="append",
        default=[],
        help="Only include files whose path contains this substring (case-insensitive).",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Skip files whose path contains this substring (case-insensitive).",
    )
    args = parser.parse_args()

    out_dir: Path = args.out
    text_dir = out_dir / "text"
    out_dir.mkdir(parents=True, exist_ok=True)
    text_dir.mkdir(parents=True, exist_ok=True)

    front_matter_min_score = (
        args.front_matter_score if args.front_matter_score is not None else args.min_score
    )
    include = [token.lower() for token in args.include]
    exclude = [token.lower() for token in args.exclude]

    seen_hashes: set[str] = set()
    selected: list[dict] = []
    report_rows: list[dict] = []

    for file_path in sorted(_iter_files(args.input, include, exclude)):
        suffix = file_path.suffix.lower()
        if suffix == ".pdf":
            segs = _extract_pdf_segments(file_path)
        elif suffix == ".epub":
            segs = _extract_epub_segments(file_path)
        else:
            continue

        scored: list[tuple[int, str, str, float, dict[str, int], str | None]] = []
        for idx, (loc, txt) in enumerate(segs):
            if not txt or len(txt) < args.min_chars:
                scored.append((idx, loc, txt, 0.0, {}, "too_short"))
                continue
            score, hits = _keyword_score(txt)
            junk = _is_junk(txt)
            scored.append((idx, loc, txt, score, hits, junk))

        keep_indices: set[int] = set()
        for idx, loc, txt, score, hits, junk in scored:
            if junk in {"toc", "index"}:
                continue
            if junk == "front_matter" and score < front_matter_min_score:
                continue
            if score < args.min_score:
                continue
            keep_indices.add(idx)
            for d in range(1, args.neighbors + 1):
                keep_indices.add(idx - d)
                keep_indices.add(idx + d)
        keep_indices = {i for i in keep_indices if 0 <= i < len(scored)}

        for idx, loc, txt, score, hits, junk in scored:
            if idx not in keep_indices:
                continue
            if not txt or len(txt) < args.min_chars:
                continue
            if junk in {"toc", "index"}:
                continue
            # Neighbor pages flagged as front matter must still clear the
            # front-matter score threshold.
            if junk == "front_matter" and score < front_matter_min_score:
                continue

            norm = _normalize(txt)
            seg_hash = _sha256(norm)
            if seg_hash in seen_hashes:
                continue
            seen_hashes.add(seg_hash)

            (text_dir / f"{seg_hash}.txt").write_text(txt, encoding="utf-8", errors="ignore")
            src = str(file_path)
            primary = f"{src}#{loc}"
            selected.append(
                {
                    "id": seg_hash,
                    "primary": primary,
                    "duplicates": [],
                    "chars": len(txt),
                    "score": score,
                    "hits": hits,
                }
            )
            report_rows.append(
                {
                    "id": seg_hash,
                    "source": src,
                    "locator": loc,
                    "score": f"{score:.2f}",
                    "chars": str(len(txt)),
                    "junk": junk or "",
                    "top_hits": ";".join(sorted(hits.keys())[:12]),
                }
            )

    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps({"docs": selected}, indent=2), encoding="utf-8")

    report_path = out_dir / "report.csv"
    with report_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["id", "source", "locator", "score", "chars", "junk", "top_hits"],
        )
        writer.writeheader()
        writer.writerows(report_rows)

    corpus_path = out_dir / "corpus.txt"
    with corpus_path.open("w", encoding="utf-8", errors="ignore") as f:
        for doc in selected:
            f.write("\n\n" + "=" * 80 + "\n")
            f.write(f"SOURCE: {doc['primary']}\n")
            f.write(f"SCORE: {doc.get('score', 0):.2f}\n")
            f.write("=" * 80 + "\n\n")
            f.write((text_dir / f"{doc['id']}.txt").read_text(encoding="utf-8", errors="ignore"))
            f.write("\n")

    print(f"Selected segments: {len(selected)}")
    print(f"Manifest: {manifest_path}")
    print(f"Report: {report_path}")
    print(f"Corpus: {corpus_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())