# ollama-model-training-5060ti/tools/select_relevant.py
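"""Select option-trading-relevant pages/sections from PDF and EPUB books.

Walks an input directory of .pdf/.epub files, scores every page/section
against a weighted options-trading keyword list, filters out junk (table of
contents, index, front matter), pulls in +/- N neighboring segments, and
writes the kept text plus manifest.json, report.csv and a concatenated
corpus.txt to the output directory.

Illustrative invocation (defaults shown; adjust paths and thresholds):

    python tools/select_relevant.py --input eBooks --out training_data/relevant --min-score 10 --neighbors 1
"""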


from __future__ import annotations
import argparse
import csv
import hashlib
import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
_OPTIONS_KEYWORDS: dict[str, float] = {
    # core
    "option": 2.0,
    "options": 2.0,
    "call": 1.0,
    "put": 1.0,
    "strike": 2.0,
    "expiration": 2.0,
    "expiry": 2.0,
    "premium": 2.0,
    "contract": 1.0,
    "underlying": 2.0,
    "open interest": 3.0,
    "bid-ask": 3.0,
    "bid ask": 3.0,
    "assignment": 3.0,
    "exercise": 2.0,
    "early exercise": 4.0,
    # greeks / vol
    "delta": 3.0,
    "gamma": 3.0,
    "theta": 3.0,
    "vega": 3.0,
    "rho": 2.0,
    "implied volatility": 4.0,
    "historical volatility": 3.0,
    "volatility smile": 3.0,
    "skew": 2.0,
    "iv": 1.5,
    # strategies
    "spread": 2.0,
    "vertical spread": 4.0,
    "calendar spread": 4.0,
    "diagonal spread": 4.0,
    "credit spread": 4.0,
    "debit spread": 4.0,
    "iron condor": 5.0,
    "butterfly": 3.0,
    "straddle": 4.0,
    "strangle": 4.0,
    "covered call": 5.0,
    "protective put": 5.0,
    "cash-secured put": 5.0,
    "ratio spread": 4.0,
    # risk / pricing
    "intrinsic value": 4.0,
    "time value": 4.0,
    "extrinsic value": 4.0,
    "breakeven": 3.0,
    "probability of profit": 4.0,
    "expected value": 3.0,
    "black-scholes": 5.0,
    "black scholes": 5.0,
    "binomial": 3.0,
    "greeks": 4.0,
    "margin": 2.0,
    "reg t": 2.0,
    "portfolio margin": 4.0,
}
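# NOTE: _keyword_score() multiplies each weight by the keyword's hit count;
# phrases containing a space are counted as space-delimited substrings, while
# single-word keywords are matched on word boundaries.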
_JUNK_PHRASES = [
    "all rights reserved",
    "no part of this publication",
    "printed in",
    "publisher",
    "isbn",
    "library of congress",
    "copyright",
    "acknowledg",
    "about the author",
    "disclaimer",
    "warranty",
]
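# Used only by _looks_like_front_matter(), which scans the first ~800
# normalized characters of a segment for any of these substrings.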
@dataclass(frozen=True)
class Segment:
    source_path: str
    locator: str  # "page:123" or "section:foo"
    text: str
    score: float
def _fix_drop_caps(text: str) -> str:
    # Join single-letter drop caps like "O ptions" -> "Options".
    for _ in range(6):
        fixed = re.sub(r"\b([A-Za-z])\s+(?=[a-z])", r"\1", text)
        if fixed == text:
            break
        text = fixed
    return text
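# Note: the join above also merges legitimate one-letter words ("a cat" ->
# "acat"). That is acceptable here because normalized text is only used for
# keyword scoring and dedup hashing, never written to the output corpus.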
def _normalize(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = _fix_drop_caps(text)
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def _sha256(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()
def _tokenish(text: str) -> list[str]:
    return re.findall(r"[a-z]{2,}", text.lower())
def _keyword_score(text: str) -> tuple[float, dict[str, int]]:
    t = " " + _normalize(text) + " "
    hits: dict[str, int] = {}
    score = 0.0
    for kw, weight in _OPTIONS_KEYWORDS.items():
        if " " in kw:
            n = t.count(" " + kw + " ")
        else:
            n = len(re.findall(rf"\b{re.escape(kw)}\b", t))
        if n:
            hits[kw] = n
            score += weight * n
    return score, hits
def _looks_like_toc(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if "table of contents" in head or re.search(r"\bcontents\b", head):
        return True
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 10:
        return False
    # Many lines ending with digits and/or dotted leaders
    end_num = sum(1 for ln in lines if re.search(r"(\.{2,}|\s)\d{1,4}$", ln))
    dotted = sum(1 for ln in lines if "..." in ln or re.search(r"\.{4,}", ln))
    shortish = sum(1 for ln in lines if len(ln) <= 60)
    if end_num / len(lines) >= 0.35 and (dotted + end_num) / len(lines) >= 0.35 and shortish / len(lines) >= 0.5:
        return True
    return False
def _looks_like_index(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if re.search(r"^\s*index\b", head):
        return True
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 15:
        return False
    indexish = 0
    for ln in lines[:200]:
        if re.search(r"\b\d{1,4}(?:,\s*\d{1,4}){2,}\b", ln):
            indexish += 1
            continue
        if re.search(r"^[a-z].{1,60}\s+\d{1,4}(?:,\s*\d{1,4})+\b", ln):
            indexish += 1
            continue
    return indexish >= max(10, math.ceil(0.25 * min(len(lines), 200)))
def _looks_like_front_matter(text: str) -> bool:
    t = _normalize(text)
    head = t[:800]
    if any(p in head for p in _JUNK_PHRASES):
        return True
    # Too little prose
    toks = _tokenish(head)
    if len(toks) < 80 and (("isbn" in head) or ("copyright" in head)):
        return True
    return False
def _is_junk(text: str) -> str | None:
    if _looks_like_toc(text):
        return "toc"
    if _looks_like_index(text):
        return "index"
    if _looks_like_front_matter(text):
        return "front_matter"
    return None
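# main() drops "toc"/"index" segments outright; "front_matter" segments are
# kept only if their keyword score clears --front-matter-score (which
# defaults to --min-score).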
def _iter_files(root: Path, include: list[str], exclude: list[str]) -> Iterable[Path]:
    exts = {".pdf", ".epub"}
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            path_lower = str(p).lower()
            if include and not any(token in path_lower for token in include):
                continue
            if exclude and any(token in path_lower for token in exclude):
                continue
            yield p
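# Extraction dependencies (pypdf for PDFs; ebooklib + BeautifulSoup/lxml for
# EPUBs) are imported lazily inside the extractors, so only the libraries for
# the formats actually present need to be installed.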
def _extract_pdf_segments(path: Path) -> list[tuple[str, str]]:
    from pypdf import PdfReader
    reader = PdfReader(str(path))
    out: list[tuple[str, str]] = []
    for i, page in enumerate(reader.pages, start=1):
        try:
            txt = page.extract_text() or ""
        except Exception:
            txt = ""
        out.append((f"page:{i}", txt))
    return out
def _extract_epub_segments(path: Path) -> list[tuple[str, str]]:
    from bs4 import BeautifulSoup
    from ebooklib import ITEM_DOCUMENT, epub
    book = epub.read_epub(str(path))
    out: list[tuple[str, str]] = []
    idx = 0
    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue
        idx += 1
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        txt = soup.get_text("\n", strip=True)
        name = getattr(item, "file_name", None) or f"doc:{idx}"
        out.append((f"section:{name}", txt))
    return out
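# Pipeline: extract per-page/per-section text -> keyword-score and junk-flag
# each segment -> keep segments above --min-score plus +/- N neighbors ->
# dedupe by SHA-256 of the normalized text -> write text/<hash>.txt files,
# manifest.json, report.csv and a concatenated corpus.txt.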
def main() -> int:
    parser = argparse.ArgumentParser(description="Select option-trading-relevant pages/sections from PDFs/EPUBs.")
    parser.add_argument("--input", type=Path, default=Path("eBooks"))
    parser.add_argument("--out", type=Path, default=Path("training_data/relevant"))
    parser.add_argument("--min-score", type=float, default=10.0)
    parser.add_argument("--front-matter-score", type=float, default=None)
    parser.add_argument("--min-chars", type=int, default=800)
    parser.add_argument("--neighbors", type=int, default=1, help="Include +/- N neighbor pages/sections around hits.")
    parser.add_argument(
        "--include",
        action="append",
        default=[],
        help="Only include files whose path contains this substring (case-insensitive).",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Skip files whose path contains this substring (case-insensitive).",
    )
    args = parser.parse_args()
    out_dir: Path = args.out
    text_dir = out_dir / "text"
    out_dir.mkdir(parents=True, exist_ok=True)
    text_dir.mkdir(parents=True, exist_ok=True)
    front_matter_min_score = args.front_matter_score if args.front_matter_score is not None else args.min_score
    include = [token.lower() for token in args.include]
    exclude = [token.lower() for token in args.exclude]
    seen_hashes: set[str] = set()
    selected: list[dict] = []
    report_rows: list[dict] = []
    for file_path in sorted(_iter_files(args.input, include, exclude)):
        suffix = file_path.suffix.lower()
        if suffix == ".pdf":
            segs = _extract_pdf_segments(file_path)
        elif suffix == ".epub":
            segs = _extract_epub_segments(file_path)
        else:
            continue
        scored: list[tuple[int, str, str, float, dict[str, int], str | None]] = []
        for idx, (loc, txt) in enumerate(segs):
            if not txt or len(txt) < args.min_chars:
                scored.append((idx, loc, txt, 0.0, {}, "too_short"))
                continue
            score, hits = _keyword_score(txt)
            junk = _is_junk(txt)
            scored.append((idx, loc, txt, score, hits, junk))
        keep_indices: set[int] = set()
        for idx, loc, txt, score, hits, junk in scored:
            if junk in {"toc", "index"}:
                continue
            if junk == "front_matter" and score < front_matter_min_score:
                continue
            if score < args.min_score:
                continue
            keep_indices.add(idx)
            for d in range(1, args.neighbors + 1):
                keep_indices.add(idx - d)
                keep_indices.add(idx + d)
        keep_indices = {i for i in keep_indices if 0 <= i < len(scored)}
        for idx, loc, txt, score, hits, junk in scored:
            if idx not in keep_indices:
                continue
            if not txt or len(txt) < args.min_chars:
                continue
            if junk in {"toc", "index"}:
                continue
            # Neighbor pages that look like front matter must still clear the front-matter score threshold.
            if junk == "front_matter" and score < front_matter_min_score:
                continue
            norm = _normalize(txt)
            seg_hash = _sha256(norm)
            if seg_hash in seen_hashes:
                continue
            seen_hashes.add(seg_hash)
            (text_dir / f"{seg_hash}.txt").write_text(txt, encoding="utf-8", errors="ignore")
            src = str(file_path)
            primary = f"{src}#{loc}"
            selected.append(
                {
                    "id": seg_hash,
                    "primary": primary,
                    "duplicates": [],
                    "chars": len(txt),
                    "score": score,
                    "hits": hits,
                }
            )
            report_rows.append(
                {
                    "id": seg_hash,
                    "source": src,
                    "locator": loc,
                    "score": f"{score:.2f}",
                    "chars": str(len(txt)),
                    "junk": junk or "",
"top_hits": ";".join(sorted(hits.keys())[:12]),
}
)
    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps({"docs": selected}, indent=2), encoding="utf-8")
    report_path = out_dir / "report.csv"
    with report_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["id", "source", "locator", "score", "chars", "junk", "top_hits"],
        )
        writer.writeheader()
        writer.writerows(report_rows)
    corpus_path = out_dir / "corpus.txt"
    with corpus_path.open("w", encoding="utf-8", errors="ignore") as f:
        for doc in selected:
            f.write("\n\n" + "=" * 80 + "\n")
            f.write(f"SOURCE: {doc['primary']}\n")
            f.write(f"SCORE: {doc.get('score', 0):.2f}\n")
            f.write("=" * 80 + "\n\n")
            f.write((text_dir / f"{doc['id']}.txt").read_text(encoding="utf-8", errors="ignore"))
            f.write("\n")
    print(f"Selected segments: {len(selected)}")
    print(f"Manifest: {manifest_path}")
    print(f"Report: {report_path}")
    print(f"Corpus: {corpus_path}")
    return 0
if __name__ == "__main__":
    raise SystemExit(main())