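"""Build a JSONL training dataset from extracted document text.

Reads a manifest of extracted documents, cleans each text file, splits it
into overlapping character chunks, optionally drops boilerplate ("junk")
chunks and chunks with a low options-keyword score, then writes one JSON
record per surviving chunk plus a small ``.stats.json`` run summary.

The manifest format is assumed (from the fields this script reads) to be:

    {"docs": [{"id": "<doc-id>", "primary": "<source label>"}, ...]}

with one UTF-8 text file per doc at ``<text-dir>/<doc-id>.txt``.
"""
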
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

# Keyword -> weight map used to score how options-specific a chunk of text is.
# Specific strategy names and multi-word terms carry more weight than generic
# words like "call" or "margin".
_OPTIONS_KEYWORDS: dict[str, float] = {
    "option": 2.0,
    "options": 2.0,
    "call": 1.0,
    "put": 1.0,
    "strike": 2.0,
    "expiration": 2.0,
    "expiry": 2.0,
    "premium": 2.0,
    "contract": 1.0,
    "underlying": 2.0,
    "open interest": 3.0,
    "bid-ask": 3.0,
    "bid ask": 3.0,
    "assignment": 3.0,
    "exercise": 2.0,
    "early exercise": 4.0,
    "delta": 3.0,
    "gamma": 3.0,
    "theta": 3.0,
    "vega": 3.0,
    "rho": 2.0,
    "implied volatility": 4.0,
    "historical volatility": 3.0,
    "volatility smile": 3.0,
    "skew": 2.0,
    "iv": 1.5,
    "spread": 2.0,
    "vertical spread": 4.0,
    "calendar spread": 4.0,
    "diagonal spread": 4.0,
    "credit spread": 4.0,
    "debit spread": 4.0,
    "iron condor": 5.0,
    "butterfly": 3.0,
    "straddle": 4.0,
    "strangle": 4.0,
    "covered call": 5.0,
    "protective put": 5.0,
    "cash-secured put": 5.0,
    "ratio spread": 4.0,
    "intrinsic value": 4.0,
    "time value": 4.0,
    "extrinsic value": 4.0,
    "breakeven": 3.0,
    "probability of profit": 4.0,
    "expected value": 3.0,
    "black-scholes": 5.0,
    "black scholes": 5.0,
    "binomial": 3.0,
    "greeks": 4.0,
    "margin": 2.0,
    "reg t": 2.0,
    "portfolio margin": 4.0,
}

# Phrases that mark front-matter/back-matter pages (copyright, ISBN, etc.)
# rather than instructional content. "acknowledg" is a stem, so the substring
# check matches both "acknowledgments" and "acknowledgements".
_JUNK_PHRASES = [
    "all rights reserved",
    "no part of this publication",
    "printed in",
    "publisher",
    "isbn",
    "library of congress",
    "copyright",
    "acknowledg",
    "about the author",
    "disclaimer",
    "warranty",
]


def _fix_drop_caps(text: str) -> str:
    # Join single-letter drop caps like "O ptions" -> "Options". Repeat until
    # the text stops changing (capped at six passes) so words split more than
    # once collapse too.
    for _ in range(6):
        fixed = re.sub(r"\b([A-Za-z])\s+(?=[a-z])", r"\1", text)
        if fixed == text:
            break
        text = fixed
    return text
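
# Illustrative: _fix_drop_caps("O ptions trading") -> "Options trading".
# Caveat (inherent to the heuristic): real one-letter words are joined too,
# e.g. "a covered call" -> "acovered call", which also hides the phrase
# "covered call" from _keyword_score.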


def _clean_text(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = _fix_drop_caps(text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")  # normalize newlines
    text = re.sub(r"[ \t]+", " ", text)  # collapse runs of spaces/tabs
    text = re.sub(r"\n{3,}", "\n\n", text)  # at most one blank line in a row
    return text.strip()
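
# Illustrative: _clean_text("O ptions\r\n\r\n\r\n\r\nare  con\u00adtracts")
# -> "Options\n\nare contracts"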


def _normalize_for_score(text: str) -> str:
    # Lighter cousin of _clean_text used only for scoring and junk detection:
    # fix drop caps, lowercase, and collapse whitespace.
    text = _fix_drop_caps(text)
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _keyword_score(text: str) -> float:
    # Weighted keyword-frequency score; higher means more options-specific.
    # A \b-bounded regex works for single words and multi-word phrases alike
    # and, unlike a plain space-delimited substring count, still matches next
    # to punctuation ("implied volatility," or "(iron condor)").
    t = _normalize_for_score(text)
    score = 0.0
    for kw, weight in _OPTIONS_KEYWORDS.items():
        n = len(re.findall(rf"\b{re.escape(kw)}\b", t))
        if n:
            score += weight * n
    return score
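
# Illustrative: _keyword_score("Delta hedging an iron condor.")
# -> 3.0 ("delta") + 5.0 ("iron condor") = 8.0
# Keywords may overlap: "covered call" counts for both "covered call" (5.0)
# and "call" (1.0).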


def _looks_like_junk(text: str) -> bool:
    # Front/back matter announces itself early, so only the first ~800
    # normalized characters are inspected.
    head = _normalize_for_score(text)[:800]
    if "table of contents" in head or re.search(r"\bcontents\b", head):
        return True
    # re.MULTILINE so an "Index" heading at the start of any line in the head
    # matches, not only at the very start of the string.
    if re.search(r"^\s*index\b", head, re.MULTILINE):
        return True
    if any(p in head for p in _JUNK_PHRASES):
        return True
    return False
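
# Illustrative: a page starting "Library of Congress Cataloging-in-Publication"
# normalizes to contain "library of congress" and is flagged as junk; ordinary
# strategy prose is not.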


def _chunk_text(text: str, *, chunk_chars: int, overlap_chars: int) -> list[str]:
    if chunk_chars <= 0:
        raise ValueError("chunk_chars must be > 0")
    if overlap_chars < 0:
        raise ValueError("overlap_chars must be >= 0")
    if overlap_chars >= chunk_chars:
        raise ValueError("overlap_chars must be < chunk_chars")

    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(start + chunk_chars, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == len(text):
            break
        # Step back by the overlap so consecutive chunks share context.
        start = end - overlap_chars
    return chunks
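
# Illustrative: _chunk_text("abcdefghij", chunk_chars=4, overlap_chars=1)
# -> ["abcd", "defg", "ghij"]  (each chunk restarts one char before the
# previous end).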


def main() -> int:
    parser = argparse.ArgumentParser(description="Build a JSONL dataset from extracted docs.")
    parser.add_argument("--manifest", type=Path, default=Path("training_data/manifest.json"))
    parser.add_argument("--text-dir", type=Path, default=Path("training_data/text"))
    parser.add_argument("--out", type=Path, default=Path("training_data/dataset.jsonl"))
    parser.add_argument("--chunk-chars", type=int, default=6000)
    parser.add_argument("--overlap-chars", type=int, default=400)
    parser.add_argument("--min-chars", type=int, default=1200)
    parser.add_argument("--min-score", type=float, default=0.0)
    parser.add_argument("--drop-junk", action="store_true")
    args = parser.parse_args()

    manifest = json.loads(args.manifest.read_text(encoding="utf-8"))
    docs = manifest.get("docs", [])
    if not docs:
        raise SystemExit(f"No docs in manifest: {args.manifest}")

    args.out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    n_chunks = 0
    with args.out.open("w", encoding="utf-8") as f:
        for doc in docs:
            doc_id = doc["id"]
            primary = doc["primary"]
            txt_path = args.text_dir / f"{doc_id}.txt"
            if not txt_path.exists():
                # Extraction may have failed for this doc; skip it quietly.
                continue
            raw = txt_path.read_text(encoding="utf-8", errors="ignore")
            cleaned = _clean_text(raw)
            if len(cleaned) < args.min_chars:
                continue

            n_docs += 1
            chunks = _chunk_text(cleaned, chunk_chars=args.chunk_chars, overlap_chars=args.overlap_chars)
            for i, chunk in enumerate(chunks):
                # Drop short (usually tail) chunks, then apply the optional
                # junk and keyword-score filters.
                if len(chunk) < args.min_chars:
                    continue
                if args.drop_junk and _looks_like_junk(chunk):
                    continue
                if args.min_score > 0 and _keyword_score(chunk) < args.min_score:
                    continue
                rec = {
                    "text": chunk,
                    "source": primary,
                    "doc_id": doc_id,
                    "chunk_index": i,
                }
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n_chunks += 1

    # Record run parameters and counts next to the dataset for reproducibility.
    stats_path = args.out.with_suffix(".stats.json")
    stats_path.write_text(
        json.dumps(
            {
                "docs_used": n_docs,
                "chunks_written": n_chunks,
                "chunk_chars": args.chunk_chars,
                "overlap_chars": args.overlap_chars,
                "min_chars": args.min_chars,
                "min_score": args.min_score,
                "drop_junk": args.drop_junk,
            },
            indent=2,
        ),
        encoding="utf-8",
    )

    print(f"Wrote {n_chunks} chunks from {n_docs} docs to {args.out}")
    print(f"Stats: {stats_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
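
# Example invocation (hypothetical file name; the flags are the ones defined
# in main() above):
#   python build_dataset.py --manifest training_data/manifest.json \
#       --drop-junk --min-score 5 --out training_data/dataset.jsonl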