Compare commits

9 Commits

5 changed files with 2278 additions and 543 deletions

AGENTS.md (1126 lines changed)

File diff suppressed because it is too large.

@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/playwright/python:v1.50.0-jammy
+FROM mcr.microsoft.com/playwright/python:v1.57.0-jammy
 WORKDIR /app
@@ -6,7 +6,7 @@ ENV PYTHONUNBUFFERED=1
 COPY scraper_service.py /app/scraper_service.py
-RUN python -m pip install --no-cache-dir flask beautifulsoup4
+RUN python -m pip install --no-cache-dir flask beautifulsoup4 playwright==1.57.0
 EXPOSE 9777
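
This bump keeps the pip-installed Playwright package in lockstep with the browser binaries baked into the base image; a skew between the two is a common cause of browser-launch failures. A minimal sanity check, as a sketch assuming it runs inside the built image, could be:

# Sketch (assumes it runs inside the built image): the pip-pinned
# playwright version should match the v1.57.0 base-image tag.
from importlib.metadata import version  # stdlib since Python 3.8

installed = version("playwright")
assert installed == "1.57.0", f"playwright {installed} != pinned 1.57.0"
print("playwright pin OK:", installed)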

File diff suppressed because it is too large.

scripts/test_cycles.py (new file, 199 lines)

@@ -0,0 +1,199 @@
import argparse
import datetime
import json
import sys
import time
import urllib.parse
import urllib.request

DEFAULT_STOCKS = ["AAPL", "AMZN", "MSFT", "TSLA"]
DEFAULT_CYCLES = [None, 5, 10, 25, 50, 75, 100, 150, 200, 500]


def http_get(base_url, params, timeout):
    query = urllib.parse.urlencode(params)
    url = f"{base_url}?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


def expected_code_from_epoch(epoch):
    return datetime.datetime.utcfromtimestamp(epoch).strftime("%y%m%d")


def all_contracts_match(opts, expected_code):
    for opt in opts:
        name = opt.get("Contract Name") or ""
        if expected_code not in name:
            return False
    return True


def parse_list(value, default):
    if not value:
        return default
    return [item.strip() for item in value.split(",") if item.strip()]


def parse_cycles(value):
    if not value:
        return DEFAULT_CYCLES
    cycles = []
    for item in value.split(","):
        token = item.strip().lower()
        if not token or token in ("default", "none"):
            cycles.append(None)
            continue
        try:
            cycles.append(int(token))
        except ValueError:
            raise ValueError(f"Invalid strikeLimit value: {item}")
    return cycles


def main():
    parser = argparse.ArgumentParser(description="Yahoo options scraper test cycles")
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:9777/scrape_sync",
        help="Base URL for /scrape_sync",
    )
    parser.add_argument(
        "--stocks",
        default=",".join(DEFAULT_STOCKS),
        help="Comma-separated stock symbols",
    )
    parser.add_argument(
        "--strike-limits",
        default="default,5,10,25,50,75,100,150,200,500",
        help="Comma-separated strike limits (use 'default' for the API default)",
    )
    parser.add_argument(
        "--baseline-limit",
        type=int,
        default=5000,
        help="Large strikeLimit used to capture all available strikes",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Request timeout in seconds",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Sleep between requests",
    )
    args = parser.parse_args()

    stocks = parse_list(args.stocks, DEFAULT_STOCKS)
    cycles = parse_cycles(args.strike_limits)

    print("Fetching expiration lists...")
    expirations = {}
    for stock in stocks:
        data = http_get(args.base_url, {"stock": stock, "expiration": "invalid"}, args.timeout)
        if "available_expirations" not in data:
            print(f"ERROR: missing available_expirations for {stock}: {data}")
            sys.exit(1)
        values = [opt.get("value") for opt in data["available_expirations"] if opt.get("value")]
        if len(values) < 4:
            print(f"ERROR: not enough expirations for {stock}: {values}")
            sys.exit(1)
        expirations[stock] = values[:4]
        print(f" {stock}: {expirations[stock]}")
        time.sleep(args.sleep)

    print("\nBuilding baseline counts (strikeLimit=%d)..." % args.baseline_limit)
    baseline_counts = {}
    for stock, exp_list in expirations.items():
        for exp in exp_list:
            data = http_get(
                args.base_url,
                {"stock": stock, "expiration": exp, "strikeLimit": args.baseline_limit},
                args.timeout,
            )
            if "error" in data:
                print(f"ERROR: baseline error for {stock} {exp}: {data}")
                sys.exit(1)
            calls_count = data.get("total_calls")
            puts_count = data.get("total_puts")
            if calls_count is None or puts_count is None:
                print(f"ERROR: baseline missing counts for {stock} {exp}: {data}")
                sys.exit(1)
            expected_code = expected_code_from_epoch(exp)
            if not all_contracts_match(data.get("calls", []), expected_code):
                print(f"ERROR: baseline calls mismatch for {stock} {exp}")
                sys.exit(1)
            if not all_contracts_match(data.get("puts", []), expected_code):
                print(f"ERROR: baseline puts mismatch for {stock} {exp}")
                sys.exit(1)
            baseline_counts[(stock, exp)] = (calls_count, puts_count)
            print(f" {stock} {exp}: calls={calls_count} puts={puts_count}")
            time.sleep(args.sleep)

    print("\nRunning %d cycles of API tests..." % len(cycles))
    for idx, strike_limit in enumerate(cycles, start=1):
        print(f"Cycle {idx}/{len(cycles)} (strikeLimit={strike_limit})")
        for stock, exp_list in expirations.items():
            for exp in exp_list:
                params = {"stock": stock, "expiration": exp}
                if strike_limit is not None:
                    params["strikeLimit"] = strike_limit
                data = http_get(args.base_url, params, args.timeout)
                if "error" in data:
                    print(f"ERROR: {stock} {exp} -> {data}")
                    sys.exit(1)
                selected_val = data.get("selected_expiration", {}).get("value")
                if selected_val != exp:
                    print(
                        f"ERROR: selected expiration mismatch for {stock} {exp}: {selected_val}"
                    )
                    sys.exit(1)
                expected_code = expected_code_from_epoch(exp)
                if not all_contracts_match(data.get("calls", []), expected_code):
                    print(f"ERROR: calls expiry mismatch for {stock} {exp}")
                    sys.exit(1)
                if not all_contracts_match(data.get("puts", []), expected_code):
                    print(f"ERROR: puts expiry mismatch for {stock} {exp}")
                    sys.exit(1)
                available_calls, available_puts = baseline_counts[(stock, exp)]
                expected_limit = strike_limit if strike_limit is not None else 25
                expected_calls = min(expected_limit, available_calls)
                expected_puts = min(expected_limit, available_puts)
                if data.get("total_calls") != expected_calls:
                    print(
                        f"ERROR: call count mismatch for {stock} {exp}: "
                        f"got {data.get('total_calls')} expected {expected_calls}"
                    )
                    sys.exit(1)
                if data.get("total_puts") != expected_puts:
                    print(
                        f"ERROR: put count mismatch for {stock} {exp}: "
                        f"got {data.get('total_puts')} expected {expected_puts}"
                    )
                    sys.exit(1)
                expected_pruned_calls = max(0, available_calls - expected_calls)
                expected_pruned_puts = max(0, available_puts - expected_puts)
                if data.get("pruned_calls_count") != expected_pruned_calls:
                    print(
                        f"ERROR: pruned calls mismatch for {stock} {exp}: "
                        f"got {data.get('pruned_calls_count')} expected {expected_pruned_calls}"
                    )
                    sys.exit(1)
                if data.get("pruned_puts_count") != expected_pruned_puts:
                    print(
                        f"ERROR: pruned puts mismatch for {stock} {exp}: "
                        f"got {data.get('pruned_puts_count')} expected {expected_pruned_puts}"
                    )
                    sys.exit(1)
                time.sleep(args.sleep)
        print(f"Cycle {idx} OK")
    print("\nAll cycles completed successfully.")


if __name__ == "__main__":
    main()
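
The contract-name assertions hinge on expected_code_from_epoch: Yahoo reports expirations as UTC epoch seconds, and OCC-style contract names embed the expiry as a YYMMDD code right after the ticker, so a payload carrying the wrong expiration is caught by a plain substring test. A worked example with an illustrative epoch (the values below are hypothetical, not taken from a test run):

import datetime

epoch = 1750377600  # 2025-06-20 00:00:00 UTC, a Friday expiry
code = datetime.datetime.utcfromtimestamp(epoch).strftime("%y%m%d")
assert code == "250620"
# A contract named "AAPL250620C00200000" contains "250620", so
# all_contracts_match() accepts it for this expiration.

Note that the checker hard-codes 25 as the service's default strikeLimit when none is sent; a typical local run would be python scripts/test_cycles.py --stocks AAPL --strike-limits default,10,100.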

@@ -0,0 +1,145 @@
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request

DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]
REQUIRED_SECTIONS = [
    "key_metrics",
    "valuation",
    "profitability",
    "growth",
    "financial_strength",
    "cashflow",
    "ownership",
    "analyst",
    "earnings",
    "performance",
]
REQUIRED_KEY_METRICS = [
    "previous_close",
    "open",
    "bid",
    "ask",
    "beta",
    "eps_trailing",
    "dividend_rate",
    "current_price",
]


def http_get(base_url, params, timeout):
    query = urllib.parse.urlencode(params)
    url = f"{base_url}?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


def parse_list(value, default):
    if not value:
        return default
    return [item.strip() for item in value.split(",") if item.strip()]


def build_signature(data):
    return {
        "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
        "valuation_keys": sorted(data.get("valuation", {}).keys()),
        "profitability_keys": sorted(data.get("profitability", {}).keys()),
        "growth_keys": sorted(data.get("growth", {}).keys()),
        "financial_strength_keys": sorted(data.get("financial_strength", {}).keys()),
        "cashflow_keys": sorted(data.get("cashflow", {}).keys()),
        "ownership_keys": sorted(data.get("ownership", {}).keys()),
        "analyst_keys": sorted(data.get("analyst", {}).keys()),
        "earnings_keys": sorted(data.get("earnings", {}).keys()),
        "performance_keys": sorted(data.get("performance", {}).keys()),
    }


def validate_payload(symbol, data):
    if "error" in data:
        return f"API error for {symbol}: {data}"
    if data.get("stock", "").upper() != symbol.upper():
        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
    validation = data.get("validation", {})
    if validation.get("symbol_match") is not True:
        return f"Validation symbol_match failed for {symbol}: {validation}"
    if validation.get("issues"):
        return f"Validation issues for {symbol}: {validation}"
    for section in REQUIRED_SECTIONS:
        if section not in data:
            return f"Missing section {section} for {symbol}"
    key_metrics = data.get("key_metrics", {})
    for field in REQUIRED_KEY_METRICS:
        if field not in key_metrics:
            return f"Missing key metric {field} for {symbol}"
    return None


def main():
    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:9777/profile",
        help="Base URL for /profile",
    )
    parser.add_argument(
        "--symbols",
        default=",".join(DEFAULT_SYMBOLS),
        help="Comma-separated stock symbols",
    )
    parser.add_argument(
        "--runs",
        type=int,
        default=8,
        help="Number of validation runs per symbol",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Request timeout in seconds",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Sleep between requests",
    )
    args = parser.parse_args()

    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
    signatures = {}

    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
    for run in range(1, args.runs + 1):
        print(f"Cycle {run}/{args.runs}")
        for symbol in symbols:
            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
            error = validate_payload(symbol, data)
            if error:
                print(f"ERROR: {error}")
                sys.exit(1)
            signature = build_signature(data)
            if symbol not in signatures:
                signatures[symbol] = signature
            elif signatures[symbol] != signature:
                print(f"ERROR: Signature changed for {symbol}")
                print(f"Baseline: {signatures[symbol]}")
                print(f"Current: {signature}")
                sys.exit(1)
            time.sleep(args.sleep)
        print(f"Cycle {run} OK")
    print("\nAll profile cycles completed successfully.")


if __name__ == "__main__":
    main()
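
The signature check is deliberately structural: build_signature reduces each payload to the sorted key sets of its ten sections, so drift in the scraper's field extraction between runs surfaces as a key-set diff rather than a flaky comparison of ever-changing quote values. A toy illustration of the idea, using hypothetical payloads and a cut-down helper:

def key_signature(payload):
    # Mirrors the idea behind build_signature: compare shapes, not values.
    return sorted(payload.get("key_metrics", {}).keys())

baseline = {"key_metrics": {"open": 189.2, "bid": 189.1}}
current = {"key_metrics": {"open": 189.5, "ask": 189.7}}

assert key_signature(baseline) == ["bid", "open"]
assert key_signature(current) == ["ask", "open"]  # "bid" gone, "ask" new
assert key_signature(baseline) != key_signature(current)  # drift detected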