Add training workflow, datasets, and runbook
tools/build_dataset.py (new file, 226 lines)
@@ -0,0 +1,226 @@
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

_OPTIONS_KEYWORDS: dict[str, float] = {
    "option": 2.0,
    "options": 2.0,
    "call": 1.0,
    "put": 1.0,
    "strike": 2.0,
    "expiration": 2.0,
    "expiry": 2.0,
    "premium": 2.0,
    "contract": 1.0,
    "underlying": 2.0,
    "open interest": 3.0,
    "bid-ask": 3.0,
    "bid ask": 3.0,
    "assignment": 3.0,
    "exercise": 2.0,
    "early exercise": 4.0,
    "delta": 3.0,
    "gamma": 3.0,
    "theta": 3.0,
    "vega": 3.0,
    "rho": 2.0,
    "implied volatility": 4.0,
    "historical volatility": 3.0,
    "volatility smile": 3.0,
    "skew": 2.0,
    "iv": 1.5,
    "spread": 2.0,
    "vertical spread": 4.0,
    "calendar spread": 4.0,
    "diagonal spread": 4.0,
    "credit spread": 4.0,
    "debit spread": 4.0,
    "iron condor": 5.0,
    "butterfly": 3.0,
    "straddle": 4.0,
    "strangle": 4.0,
    "covered call": 5.0,
    "protective put": 5.0,
    "cash-secured put": 5.0,
    "ratio spread": 4.0,
    "intrinsic value": 4.0,
    "time value": 4.0,
    "extrinsic value": 4.0,
    "breakeven": 3.0,
    "probability of profit": 4.0,
    "expected value": 3.0,
    "black-scholes": 5.0,
    "black scholes": 5.0,
    "binomial": 3.0,
    "greeks": 4.0,
    "margin": 2.0,
    "reg t": 2.0,
    "portfolio margin": 4.0,
}

_JUNK_PHRASES = [
    "all rights reserved",
    "no part of this publication",
    "printed in",
    "publisher",
    "isbn",
    "library of congress",
    "copyright",
    "acknowledg",
    "about the author",
    "disclaimer",
    "warranty",
]


def _fix_drop_caps(text: str) -> str:
    # Join single-letter drop caps like "O ptions" -> "Options".
    for _ in range(6):
        fixed = re.sub(r"\b([A-Za-z])\s+(?=[a-z])", r"\1", text)
        if fixed == text:
            break
        text = fixed
    return text


def _clean_text(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = _fix_drop_caps(text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _normalize_for_score(text: str) -> str:
    text = _fix_drop_caps(text)
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _keyword_score(text: str) -> float:
    t = " " + _normalize_for_score(text) + " "
    score = 0.0
    for kw, weight in _OPTIONS_KEYWORDS.items():
        if " " in kw:
            n = t.count(" " + kw + " ")
        else:
            n = len(re.findall(rf"\b{re.escape(kw)}\b", t))
        if n:
            score += weight * n
    return score


def _looks_like_junk(text: str) -> bool:
    head = _normalize_for_score(text)[:800]
    if "table of contents" in head or re.search(r"\bcontents\b", head):
        return True
    if re.search(r"^\s*index\b", head):
        return True
    if any(p in head for p in _JUNK_PHRASES):
        return True
    return False


def _chunk_text(text: str, *, chunk_chars: int, overlap_chars: int) -> list[str]:
    if chunk_chars <= 0:
        raise ValueError("chunk_chars must be > 0")
    if overlap_chars < 0:
        raise ValueError("overlap_chars must be >= 0")
    if overlap_chars >= chunk_chars:
        raise ValueError("overlap_chars must be < chunk_chars")

    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(start + chunk_chars, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == len(text):
            break
        start = end - overlap_chars
    return chunks


def main() -> int:
    parser = argparse.ArgumentParser(description="Build a JSONL dataset from extracted docs.")
    parser.add_argument("--manifest", type=Path, default=Path("training_data/manifest.json"))
    parser.add_argument("--text-dir", type=Path, default=Path("training_data/text"))
    parser.add_argument("--out", type=Path, default=Path("training_data/dataset.jsonl"))
    parser.add_argument("--chunk-chars", type=int, default=6000)
    parser.add_argument("--overlap-chars", type=int, default=400)
    parser.add_argument("--min-chars", type=int, default=1200)
    parser.add_argument("--min-score", type=float, default=0.0)
    parser.add_argument("--drop-junk", action="store_true")
    args = parser.parse_args()

    manifest = json.loads(args.manifest.read_text(encoding="utf-8"))
    docs = manifest.get("docs", [])
    if not docs:
        raise SystemExit(f"No docs in manifest: {args.manifest}")

    args.out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    n_chunks = 0
    with args.out.open("w", encoding="utf-8") as f:
        for doc in docs:
            doc_id = doc["id"]
            primary = doc["primary"]
            txt_path = args.text_dir / f"{doc_id}.txt"
            if not txt_path.exists():
                continue
            raw = txt_path.read_text(encoding="utf-8", errors="ignore")
            cleaned = _clean_text(raw)
            if len(cleaned) < args.min_chars:
                continue

            n_docs += 1
            chunks = _chunk_text(cleaned, chunk_chars=args.chunk_chars, overlap_chars=args.overlap_chars)
            for i, chunk in enumerate(chunks):
                if len(chunk) < args.min_chars:
                    continue
                if args.drop_junk and _looks_like_junk(chunk):
                    continue
                if args.min_score > 0 and _keyword_score(chunk) < args.min_score:
                    continue
                rec = {
                    "text": chunk,
                    "source": primary,
                    "doc_id": doc_id,
                    "chunk_index": i,
                }
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n_chunks += 1

    stats_path = args.out.with_suffix(".stats.json")
    stats_path.write_text(
        json.dumps(
            {
                "docs_used": n_docs,
                "chunks_written": n_chunks,
                "chunk_chars": args.chunk_chars,
                "overlap_chars": args.overlap_chars,
                "min_chars": args.min_chars,
                "min_score": args.min_score,
                "drop_junk": args.drop_junk,
            },
            indent=2,
        ),
        encoding="utf-8",
    )

    print(f"Wrote {n_chunks} chunks from {n_docs} docs to {args.out}")
    print(f"Stats: {stats_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
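The chunker above walks the cleaned text in fixed-size windows, each new window starting overlap_chars before the previous one ended. A toy rerun of the same boundary arithmetic, using the script's default sizes on a 15,000-character string, shows where consecutive chunks begin and end (this is an illustration, not part of the committed file):

# Same boundary arithmetic as _chunk_text, on a synthetic string.
text = "x" * 15000
chunk_chars, overlap_chars = 6000, 400

spans = []
start = 0
while start < len(text):
    end = min(start + chunk_chars, len(text))
    spans.append((start, end))
    if end == len(text):
        break
    start = end - overlap_chars

print(spans)  # [(0, 6000), (5600, 11600), (11200, 15000)]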
tools/extract_corpus.py (new file, 173 lines)
@@ -0,0 +1,173 @@
from __future__ import annotations

import argparse
import hashlib
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable


@dataclass(frozen=True)
class ExtractedDoc:
    source_path: str
    text: str


def _normalize_for_hash(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _sha256_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()


def _extract_pdf(path: Path) -> str:
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    parts: list[str] = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            parts.append("")
    return "\n".join(parts)


def _extract_epub(path: Path) -> str:
    from bs4 import BeautifulSoup
    from ebooklib import ITEM_DOCUMENT, epub

    book = epub.read_epub(str(path))
    parts: list[str] = []
    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        parts.append(soup.get_text("\n", strip=True))
    return "\n".join(parts)


def _read_text_file(path: Path) -> str:
    import chardet

    raw = path.read_bytes()
    guess = chardet.detect(raw)
    encoding = guess.get("encoding") or "utf-8"
    try:
        return raw.decode(encoding, errors="replace")
    except LookupError:
        return raw.decode("utf-8", errors="replace")


def extract_text(path: Path) -> ExtractedDoc | None:
    suffix = path.suffix.lower()
    try:
        if suffix == ".pdf":
            return ExtractedDoc(str(path), _extract_pdf(path))
        if suffix == ".epub":
            return ExtractedDoc(str(path), _extract_epub(path))
        if suffix in {".txt", ".md"}:
            return ExtractedDoc(str(path), _read_text_file(path))
    except Exception:
        return None
    return None


def iter_candidate_files(root: Path) -> Iterable[Path]:
    exts = {".pdf", ".epub", ".txt", ".md"}
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        if path.suffix.lower() not in exts:
            continue
        yield path


def main() -> int:
    parser = argparse.ArgumentParser(description="Extract and dedupe local documents into a plain-text corpus.")
    parser.add_argument("--input", type=Path, default=Path("eBooks"), help="Input directory to scan (default: eBooks).")
    parser.add_argument(
        "--out",
        type=Path,
        default=Path("training_data"),
        help="Output directory (default: training_data).",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=2000,
        help="Skip extracted docs shorter than this (default: 2000).",
    )
    args = parser.parse_args()

    in_dir: Path = args.input
    out_dir: Path = args.out
    out_text_dir = out_dir / "text"
    out_text_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_dir / "manifest.json"
    corpus_path = out_dir / "corpus.txt"
    rejected_path = out_dir / "rejected.json"

    docs: dict[str, dict] = {}
    rejected: list[dict] = []
    seen_hashes: set[str] = set()

    candidates = sorted(iter_candidate_files(in_dir))
    for file_path in candidates:
        extracted = extract_text(file_path)
        if extracted is None:
            rejected.append({"path": str(file_path), "reason": "extract_failed"})
            continue

        normalized = _normalize_for_hash(extracted.text)
        if len(normalized) < args.min_chars:
            rejected.append({"path": str(file_path), "reason": "too_short"})
            continue

        doc_hash = _sha256_text(normalized)
        if doc_hash in seen_hashes:
            docs[doc_hash]["duplicates"].append(str(file_path))
            continue

        seen_hashes.add(doc_hash)
        out_txt = out_text_dir / f"{doc_hash}.txt"
        out_txt.write_text(extracted.text, encoding="utf-8", errors="ignore")

        docs[doc_hash] = {
            "id": doc_hash,
            "primary": str(file_path),
            "duplicates": [],
            "chars": len(extracted.text),
        }

    manifest_path.write_text(json.dumps({"docs": list(docs.values())}, indent=2), encoding="utf-8")
    rejected_path.write_text(json.dumps(rejected, indent=2), encoding="utf-8")

    # Build concatenated corpus
    with corpus_path.open("w", encoding="utf-8", errors="ignore") as f:
        for doc in docs.values():
            f.write("\n\n" + "=" * 80 + "\n")
            f.write(f"SOURCE: {doc['primary']}\n")
            f.write("=" * 80 + "\n\n")
            f.write((out_text_dir / f"{doc['id']}.txt").read_text(encoding="utf-8", errors="ignore"))
            f.write("\n")

    print(f"Extracted unique docs: {len(docs)}")
    print(f"Wrote corpus: {corpus_path}")
    print(f"Manifest: {manifest_path}")
    if rejected:
        print(f"Rejected: {len(rejected)} (see {rejected_path})")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
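The extractor dedupes by hashing a normalized copy of each document, so identical books under different filenames collapse into one manifest entry. One way to drive it from Python rather than a shell is sketched below; the flags simply restate the script's own defaults, so running it with no arguments does the same thing.

# Illustrative invocation of the extractor with its default paths spelled out.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "tools/extract_corpus.py",
        "--input", "eBooks",
        "--out", "training_data",
        "--min-chars", "2000",
    ],
    check=True,
)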
tools/finetune_lora.py (new file, 230 lines)
@@ -0,0 +1,230 @@
from __future__ import annotations

import argparse
import json
import os
import random
import time
from pathlib import Path

import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def main() -> int:
    parser = argparse.ArgumentParser(description="LoRA fine-tune gpt-oss-20b on a local JSONL text corpus.")
    parser.add_argument("--model", default="openai/gpt-oss-20b")
    parser.add_argument("--data", type=Path, default=Path("training_data/relevant/dataset.jsonl"))
    parser.add_argument("--out", type=Path, default=Path("training_data/lora_adapter"))
    parser.add_argument("--max-length", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--max-steps", type=int, default=0, help="If >0, stop after this many optimizer steps.")
    parser.add_argument("--lr", type=float, default=2e-4)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--lora-r", type=int, default=8)
    parser.add_argument("--lora-alpha", type=int, default=16)
    parser.add_argument("--lora-dropout", type=float, default=0.05)
    parser.add_argument("--grad-accum", type=int, default=4)
    parser.add_argument("--device", default="auto")
    parser.add_argument("--device-map", choices=["auto", "cuda"], default="auto")
    parser.add_argument("--cpu-offload", action="store_true")
    parser.add_argument("--max-gpu-mem", default=None, help="Max GPU memory for device_map=auto, e.g. 10GiB")
    parser.add_argument("--max-cpu-mem", default=None, help="Max CPU memory for device_map=auto, e.g. 64GiB")
    parser.add_argument("--quant", choices=["auto", "none", "4bit"], default="auto")
    parser.add_argument("--log-steps", type=int, default=10)
    parser.add_argument("--log-seconds", type=int, default=120)
    parser.add_argument("--local-files-only", action="store_true")
    args = parser.parse_args()

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    # Reduce noisy parallelism; avoid oversubscribing if user has many cores.
    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = str(max(1, (os.cpu_count() or 8) // 2))

    args.out.mkdir(parents=True, exist_ok=True)

    lines = args.data.read_text(encoding="utf-8", errors="ignore").splitlines()
    records = [json.loads(ln) for ln in lines if ln.strip()]
    texts = [r.get("text", "") for r in records if isinstance(r, dict) and r.get("text")]
    if not texts:
        raise SystemExit(f"No text records found in {args.data}")

    print(f"Loaded {len(texts)} training samples from {args.data}")

    if args.device == "auto":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)

    if device.type == "cuda":
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.quant in {"auto", "4bit"} and device.type != "cuda":
        raise SystemExit("Quantized loading requires CUDA. Use --quant none for CPU.")

    print("Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(args.model, local_files_only=args.local_files_only)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    print("Loading model...")
    config = AutoConfig.from_pretrained(args.model, local_files_only=args.local_files_only)
    has_quant_attr = hasattr(config, "quantization_config")

    if args.quant == "auto":
        model = AutoModelForCausalLM.from_pretrained(
            args.model,
            local_files_only=args.local_files_only,
            device_map="auto",
        )
    elif args.quant == "4bit":
        compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=compute_dtype,
            llm_int8_enable_fp32_cpu_offload=args.cpu_offload,
        )
        if has_quant_attr:
            delattr(config, "quantization_config")
        device_map = {"": 0} if args.device_map == "cuda" else "auto"
        load_kwargs = dict(
            local_files_only=args.local_files_only,
            config=config,
            device_map=device_map,
            torch_dtype=compute_dtype,
        )
        if device_map == "auto" and (args.max_gpu_mem or args.max_cpu_mem):
            max_memory = {}
            if args.max_gpu_mem:
                max_memory[0] = args.max_gpu_mem
            if args.max_cpu_mem:
                max_memory["cpu"] = args.max_cpu_mem
            load_kwargs["max_memory"] = max_memory
        load_kwargs["quantization_config"] = bnb_config
        model = AutoModelForCausalLM.from_pretrained(args.model, **load_kwargs)
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            args.model,
            local_files_only=args.local_files_only,
            torch_dtype=torch.bfloat16,
            device_map={"": "cpu" if device.type == "cpu" else 0},
            low_cpu_mem_usage=True,
        )
        model.to(device)

    lora_cfg = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()
    model.config.use_cache = False
    if device.type == "cuda":
        model.gradient_checkpointing_enable()
        # Needed for PEFT + gradient checkpointing to ensure grads flow to LoRA params.
        model.enable_input_require_grads()

    trainable = [p for p in model.parameters() if p.requires_grad]
    opt = torch.optim.AdamW(trainable, lr=args.lr)

    # Training loop
    model.train()
    total_opt_steps = 0
    total_batches = 0
    loss_ema: float | None = None
    last_log = time.time()
    accum_steps = 0

    for epoch in range(1, args.epochs + 1):
        order = list(range(len(texts)))
        random.shuffle(order)

        for i in order:
            text = texts[i]
            batch = tok(text, return_tensors="pt", truncation=True, max_length=args.max_length)
            if batch["input_ids"].numel() < 32:
                continue
            batch["labels"] = batch["input_ids"].clone()
            batch = {k: v.to(device) for k, v in batch.items()}

            t0 = time.time()
            out = model(**batch)
            loss = out.loss / max(1, args.grad_accum)
            loss.backward()
            accum_steps += 1
            if accum_steps >= max(1, args.grad_accum):
                torch.nn.utils.clip_grad_norm_(trainable, 1.0)
                opt.step()
                opt.zero_grad(set_to_none=True)
                total_opt_steps += 1
                accum_steps = 0
            dt = time.time() - t0

            total_batches += 1

            lv = float(loss.detach().cpu().item()) * max(1, args.grad_accum)
            loss_ema = lv if loss_ema is None else (0.95 * loss_ema + 0.05 * lv)

            if (args.log_steps and total_opt_steps % args.log_steps == 0) or (
                args.log_seconds and time.time() - last_log >= args.log_seconds
            ):
                tok_count = int(batch["input_ids"].numel())
                print(
                    f"epoch {epoch}/{args.epochs} step {total_opt_steps} "
                    f"loss {lv:.4f} ema {loss_ema:.4f} "
                    f"{dt:.2f}s {tok_count} tokens"
                )
                last_log = time.time()

            if args.max_steps and total_opt_steps >= args.max_steps:
                break

        if accum_steps:
            torch.nn.utils.clip_grad_norm_(trainable, 1.0)
            opt.step()
            opt.zero_grad(set_to_none=True)
            total_opt_steps += 1
            accum_steps = 0
            if args.max_steps and total_opt_steps >= args.max_steps:
                break

        if args.max_steps and total_opt_steps >= args.max_steps:
            break

    print(f"Saving adapter to {args.out} ...")
    model.save_pretrained(args.out)
    tok.save_pretrained(args.out)

    summary = {
        "model": args.model,
        "data": str(args.data),
        "out": str(args.out),
        "max_length": args.max_length,
        "epochs": args.epochs,
        "max_steps": args.max_steps,
        "lr": args.lr,
        "lora_r": args.lora_r,
        "lora_alpha": args.lora_alpha,
        "lora_dropout": args.lora_dropout,
        "optimizer_steps": total_opt_steps,
        "loss_ema": loss_ema,
    }
    (args.out / "training_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print("Done.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
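Before committing to a full epoch, a capped smoke run is a cheap way to confirm the model loads and the loss moves. The sketch below uses only flags defined in the script; the step count is an arbitrary example, and --local-files-only assumes the base model is already in the local Hugging Face cache.

# Illustrative smoke run: default --quant auto, stop after 50 optimizer steps.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "tools/finetune_lora.py",
        "--data", "training_data/relevant/dataset.jsonl",
        "--out", "training_data/lora_adapter",
        "--max-steps", "50",
        "--local-files-only",
    ],
    check=True,
)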
tools/select_relevant.py (new file, 387 lines)
@@ -0,0 +1,387 @@
from __future__ import annotations

import argparse
import csv
import hashlib
import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable


_OPTIONS_KEYWORDS: dict[str, float] = {
    # core
    "option": 2.0,
    "options": 2.0,
    "call": 1.0,
    "put": 1.0,
    "strike": 2.0,
    "expiration": 2.0,
    "expiry": 2.0,
    "premium": 2.0,
    "contract": 1.0,
    "underlying": 2.0,
    "open interest": 3.0,
    "bid-ask": 3.0,
    "bid ask": 3.0,
    "assignment": 3.0,
    "exercise": 2.0,
    "early exercise": 4.0,
    # greeks / vol
    "delta": 3.0,
    "gamma": 3.0,
    "theta": 3.0,
    "vega": 3.0,
    "rho": 2.0,
    "implied volatility": 4.0,
    "historical volatility": 3.0,
    "volatility smile": 3.0,
    "skew": 2.0,
    "iv": 1.5,
    # strategies
    "spread": 2.0,
    "vertical spread": 4.0,
    "calendar spread": 4.0,
    "diagonal spread": 4.0,
    "credit spread": 4.0,
    "debit spread": 4.0,
    "iron condor": 5.0,
    "butterfly": 3.0,
    "straddle": 4.0,
    "strangle": 4.0,
    "covered call": 5.0,
    "protective put": 5.0,
    "cash-secured put": 5.0,
    "ratio spread": 4.0,
    # risk / pricing
    "intrinsic value": 4.0,
    "time value": 4.0,
    "extrinsic value": 4.0,
    "breakeven": 3.0,
    "probability of profit": 4.0,
    "expected value": 3.0,
    "black-scholes": 5.0,
    "black scholes": 5.0,
    "binomial": 3.0,
    "greeks": 4.0,
    "margin": 2.0,
    "reg t": 2.0,
    "portfolio margin": 4.0,
}

_JUNK_PHRASES = [
    "all rights reserved",
    "no part of this publication",
    "printed in",
    "publisher",
    "isbn",
    "library of congress",
    "copyright",
    "acknowledg",
    "about the author",
    "disclaimer",
    "warranty",
]


@dataclass(frozen=True)
class Segment:
    source_path: str
    locator: str  # "page:123" or "section:foo"
    text: str
    score: float


def _fix_drop_caps(text: str) -> str:
    # Join single-letter drop caps like "O ptions" -> "Options".
    for _ in range(6):
        fixed = re.sub(r"\b([A-Za-z])\s+(?=[a-z])", r"\1", text)
        if fixed == text:
            break
        text = fixed
    return text


def _normalize(text: str) -> str:
    text = text.replace("\u00ad", "")  # soft hyphen
    text = text.replace("\u200b", "")  # zero-width space
    text = _fix_drop_caps(text)
    text = text.lower()
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _sha256(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()


def _tokenish(text: str) -> list[str]:
    return re.findall(r"[a-z]{2,}", text.lower())


def _keyword_score(text: str) -> tuple[float, dict[str, int]]:
    t = " " + _normalize(text) + " "
    hits: dict[str, int] = {}
    score = 0.0
    for kw, weight in _OPTIONS_KEYWORDS.items():
        if " " in kw:
            n = t.count(" " + kw + " ")
        else:
            n = len(re.findall(rf"\b{re.escape(kw)}\b", t))
        if n:
            hits[kw] = n
            score += weight * n
    return score, hits


def _looks_like_toc(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if "table of contents" in head or re.search(r"\bcontents\b", head):
        return True

    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 10:
        return False

    # Many lines ending with digits and/or dotted leaders
    end_num = sum(1 for ln in lines if re.search(r"(\.{2,}|\s)\d{1,4}$", ln))
    dotted = sum(1 for ln in lines if "..." in ln or re.search(r"\.{4,}", ln))
    shortish = sum(1 for ln in lines if len(ln) <= 60)

    if end_num / len(lines) >= 0.35 and (dotted + end_num) / len(lines) >= 0.35 and shortish / len(lines) >= 0.5:
        return True
    return False


def _looks_like_index(text: str) -> bool:
    t = _normalize(text)
    head = t[:400]
    if re.search(r"^\s*index\b", head):
        return True

    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if len(lines) < 15:
        return False

    indexish = 0
    for ln in lines[:200]:
        if re.search(r"\b\d{1,4}(?:,\s*\d{1,4}){2,}\b", ln):
            indexish += 1
            continue
        if re.search(r"^[a-z].{1,60}\s+\d{1,4}(?:,\s*\d{1,4})+\b", ln):
            indexish += 1
            continue
    return indexish >= max(10, math.ceil(0.25 * min(len(lines), 200)))


def _looks_like_front_matter(text: str) -> bool:
    t = _normalize(text)
    head = t[:800]
    if any(p in head for p in _JUNK_PHRASES):
        return True
    # Too little prose
    toks = _tokenish(head)
    if len(toks) < 80 and (("isbn" in head) or ("copyright" in head)):
        return True
    return False


def _is_junk(text: str) -> str | None:
    if _looks_like_toc(text):
        return "toc"
    if _looks_like_index(text):
        return "index"
    if _looks_like_front_matter(text):
        return "front_matter"
    return None


def _iter_files(root: Path, include: list[str], exclude: list[str]) -> Iterable[Path]:
    exts = {".pdf", ".epub"}
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            path_lower = str(p).lower()
            if include and not any(token in path_lower for token in include):
                continue
            if exclude and any(token in path_lower for token in exclude):
                continue
            yield p


def _extract_pdf_segments(path: Path) -> list[tuple[str, str]]:
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    out: list[tuple[str, str]] = []
    for i, page in enumerate(reader.pages, start=1):
        try:
            txt = page.extract_text() or ""
        except Exception:
            txt = ""
        out.append((f"page:{i}", txt))
    return out


def _extract_epub_segments(path: Path) -> list[tuple[str, str]]:
    from bs4 import BeautifulSoup
    from ebooklib import ITEM_DOCUMENT, epub

    book = epub.read_epub(str(path))
    out: list[tuple[str, str]] = []
    idx = 0
    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue
        idx += 1
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        txt = soup.get_text("\n", strip=True)
        name = getattr(item, "file_name", None) or f"doc:{idx}"
        out.append((f"section:{name}", txt))
    return out


def main() -> int:
    parser = argparse.ArgumentParser(description="Select option-trading-relevant pages/sections from PDFs/EPUBs.")
    parser.add_argument("--input", type=Path, default=Path("eBooks"))
    parser.add_argument("--out", type=Path, default=Path("training_data/relevant"))
    parser.add_argument("--min-score", type=float, default=10.0)
    parser.add_argument("--front-matter-score", type=float, default=None)
    parser.add_argument("--min-chars", type=int, default=800)
    parser.add_argument("--neighbors", type=int, default=1, help="Include +/- N neighbor pages/sections around hits.")
    parser.add_argument(
        "--include",
        action="append",
        default=[],
        help="Only include files whose path contains this substring (case-insensitive).",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Skip files whose path contains this substring (case-insensitive).",
    )
    args = parser.parse_args()

    out_dir: Path = args.out
    text_dir = out_dir / "text"
    out_dir.mkdir(parents=True, exist_ok=True)
    text_dir.mkdir(parents=True, exist_ok=True)
    front_matter_min_score = args.front_matter_score if args.front_matter_score is not None else args.min_score
    include = [token.lower() for token in args.include]
    exclude = [token.lower() for token in args.exclude]

    seen_hashes: set[str] = set()
    selected: list[dict] = []
    report_rows: list[dict] = []

    for file_path in sorted(_iter_files(args.input, include, exclude)):
        suffix = file_path.suffix.lower()
        if suffix == ".pdf":
            segs = _extract_pdf_segments(file_path)
        elif suffix == ".epub":
            segs = _extract_epub_segments(file_path)
        else:
            continue

        scored: list[tuple[int, str, str, float, dict[str, int], str | None]] = []
        for idx, (loc, txt) in enumerate(segs):
            if not txt or len(txt) < args.min_chars:
                scored.append((idx, loc, txt, 0.0, {}, "too_short"))
                continue
            score, hits = _keyword_score(txt)
            junk = _is_junk(txt)
            scored.append((idx, loc, txt, score, hits, junk))

        keep_indices: set[int] = set()
        for idx, loc, txt, score, hits, junk in scored:
            if junk in {"toc", "index"}:
                continue
            if junk == "front_matter" and score < front_matter_min_score:
                continue
            if score < args.min_score:
                continue
            keep_indices.add(idx)
            for d in range(1, args.neighbors + 1):
                keep_indices.add(idx - d)
                keep_indices.add(idx + d)

        keep_indices = {i for i in keep_indices if 0 <= i < len(scored)}

        for idx, loc, txt, score, hits, junk in scored:
            if idx not in keep_indices:
                continue
            if not txt or len(txt) < args.min_chars:
                continue
            if junk in {"toc", "index"}:
                continue
            # For neighbor pages, allow some front matter, but only if score isn't near-zero
            if junk == "front_matter" and score < front_matter_min_score:
                continue

            norm = _normalize(txt)
            seg_hash = _sha256(norm)
            if seg_hash in seen_hashes:
                continue
            seen_hashes.add(seg_hash)

            (text_dir / f"{seg_hash}.txt").write_text(txt, encoding="utf-8", errors="ignore")

            src = str(file_path)
            primary = f"{src}#{loc}"
            selected.append(
                {
                    "id": seg_hash,
                    "primary": primary,
                    "duplicates": [],
                    "chars": len(txt),
                    "score": score,
                    "hits": hits,
                }
            )
            report_rows.append(
                {
                    "id": seg_hash,
                    "source": src,
                    "locator": loc,
                    "score": f"{score:.2f}",
                    "chars": str(len(txt)),
                    "junk": junk or "",
                    "top_hits": ";".join(sorted(hits.keys())[:12]),
                }
            )

    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps({"docs": selected}, indent=2), encoding="utf-8")

    report_path = out_dir / "report.csv"
    with report_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["id", "source", "locator", "score", "chars", "junk", "top_hits"],
        )
        writer.writeheader()
        writer.writerows(report_rows)

    corpus_path = out_dir / "corpus.txt"
    with corpus_path.open("w", encoding="utf-8", errors="ignore") as f:
        for doc in selected:
            f.write("\n\n" + "=" * 80 + "\n")
            f.write(f"SOURCE: {doc['primary']}\n")
            f.write(f"SCORE: {doc.get('score', 0):.2f}\n")
            f.write("=" * 80 + "\n\n")
            f.write((text_dir / f"{doc['id']}.txt").read_text(encoding="utf-8", errors="ignore"))
            f.write("\n")

    print(f"Selected segments: {len(selected)}")
    print(f"Manifest: {manifest_path}")
    print(f"Report: {report_path}")
    print(f"Corpus: {corpus_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
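Taken together, the scripts' default paths suggest one plausible end-to-end order: select relevant pages into training_data/relevant/, chunk that selection into the JSONL file the trainer reads by default, then fine-tune. The sketch below is an assumption-laden runbook, not part of the commit; every flag shown exists in the scripts above, but the numeric values are only examples.

# Illustrative pipeline order, inferred from the scripts' default paths.
import subprocess
import sys

def run(*argv: str) -> None:
    subprocess.run([sys.executable, *argv], check=True)

# 1. Score and select relevant pages/sections into training_data/relevant/.
run("tools/select_relevant.py", "--input", "eBooks", "--min-score", "10")

# 2. Chunk the selected text into the JSONL dataset the trainer expects.
run(
    "tools/build_dataset.py",
    "--manifest", "training_data/relevant/manifest.json",
    "--text-dir", "training_data/relevant/text",
    "--out", "training_data/relevant/dataset.jsonl",
    "--drop-junk",
)

# 3. LoRA fine-tune; the trainer's --data default already points at step 2's output.
run("tools/finetune_lora.py", "--max-steps", "100")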