Add profile endpoint and validation

2025-12-29 00:45:13 -08:00
parent 711d87a998
commit 68805ed80a
3 changed files with 600 additions and 0 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,6 +16,14 @@
  - If `strikeLimit` is greater than available strikes, all available rows are returned.
  - `pruned_calls_count` and `pruned_puts_count` report how many rows were removed beyond the limit.
  - `selected_expiration` reports the resolved expiry (epoch + label), and mismatches return an error.
+- Route: `GET /profile`
+- Query params:
+  - `stock`: symbol (default `MSFT`).
+- Behavior:
+  - Loads `https://finance.yahoo.com/quote/<SYMBOL>/` with Playwright.
+  - Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations).
+  - Parses the rendered DOM for recent news and the ticker news summary.
+  - Returns company profile, key stats, earnings, analyst data, performance, and news in JSON.

 ## Guard Rails
 - Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push.
@@ -28,9 +36,26 @@
 - Local server:
  - Start: `.\venv\Scripts\python.exe scraper_service.py`
  - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (local server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
 - Docker server:
  - Start: `docker run --rm -p 9777:9777 rushabhtechie/yahoo-scraper:latest`
  - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (docker server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
+
+## Update Log (2025-12-28)
+- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations) plus rendered news extraction.
+- Response now includes company profile, key stats, earnings, analyst trends, performance overview, ticker news summary, and recent news items.
+- Validation added to ensure quote data matches the requested symbol, with issues reported in `validation`.
+- Issue encountered: an existing server instance was bound to port 9777 without `/profile`; resolved by restarting the service with the updated script.
+- Tests executed (local):
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --runs 8 --timeout 180`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Tests executed (docker):
+  - `docker build -t rushabhtechie/yahoo-scraper:latest .`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8 --timeout 180`
 - The test harness verifies:
  - Requested expiration matches `selected_expiration.value`.
  - Contract symbols include the expected YYMMDD code.
--- a/scraper_service.py
+++ b/scraper_service.py
@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
 from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone
+import html
 import urllib.parse
 import logging
 import json
@@ -353,6 +354,415 @@ def parse_strike_limit(value, default=25):
    return limit if limit > 0 else default


+def parse_sveltekit_payload(raw_text):
+    """Decode the JSON text of one SvelteKit ``data-sveltekit-fetched`` script.
+
+    Args:
+        raw_text: Raw text content of the script tag.
+
+    Returns:
+        A dict with ``status``, ``statusText`` and ``body`` keys, or ``None``
+        when the text is empty, is not valid JSON, or does not decode to a
+        JSON object. ``body`` is JSON-decoded a second time when it is a
+        string holding JSON; otherwise it is returned unchanged.
+    """
+    if not raw_text:
+        return None
+    try:
+        outer = json.loads(raw_text)
+    except json.JSONDecodeError:
+        return None
+    # Valid JSON that is not an object (list, string, number) has no
+    # status/body fields; treat it as "no payload" instead of raising.
+    if not isinstance(outer, dict):
+        return None
+    body = outer.get("body")
+    if isinstance(body, str):
+        try:
+            body = json.loads(body)
+        except json.JSONDecodeError:
+            # Deliberate best effort: a non-JSON body is kept as the raw string.
+            pass
+    return {
+        "status": outer.get("status"),
+        "statusText": outer.get("statusText"),
+        "body": body,
+    }
+
+
+def extract_sveltekit_payloads_from_soup(soup):
+    """Collect the embedded SvelteKit fetch payloads found in a parsed page.
+
+    Args:
+        soup: Parsed document exposing ``select``; ``None`` is accepted.
+
+    Returns:
+        A dict mapping each script's HTML-unescaped ``data-url`` attribute to
+        the payload produced by ``parse_sveltekit_payload``. Scripts without a
+        ``data-url`` or without a decodable payload are skipped. When two
+        scripts share a URL, the later one wins.
+    """
+    found = {}
+    if soup is None:
+        return found
+    selector = 'script[type="application/json"][data-sveltekit-fetched]'
+    for tag in soup.select(selector):
+        data_url = tag.get("data-url")
+        if data_url:
+            parsed = parse_sveltekit_payload(tag.string or tag.get_text())
+            if parsed:
+                found[html.unescape(data_url)] = parsed
+    return found
+
+
+def select_payload(payloads, needle, symbol=None):
+    """Pick the payload whose URL contains ``needle``, preferring ``symbol``.
+
+    Matching is case-insensitive. A URL counts as mentioning the symbol when
+    the symbol appears right after ``/`` or ``symbols=`` and is followed by
+    the end of the URL or a delimiter (``/ ? & # ,`` or an encoded comma).
+    The delimiter check stops a short symbol such as ``A`` from matching a
+    URL for ``AAPL``.
+
+    Args:
+        payloads: Mapping of URL to payload dict (each with a ``body`` key).
+        needle: Substring that identifies the endpoint of interest.
+        symbol: Optional ticker symbol used to prefer a specific URL.
+
+    Returns:
+        ``(url, body)`` of the first symbol-specific match; otherwise of the
+        first URL containing ``needle``; otherwise ``(None, None)``.
+    """
+    import re  # local import so the block does not rely on a file-level one
+
+    if not payloads:
+        return None, None
+    needle = needle.lower()
+    symbol_pattern = None
+    if symbol:
+        symbol_pattern = re.compile(
+            r"(?:/|symbols=)" + re.escape(symbol.lower()) + r"(?=$|[/?&#,]|%2c)"
+        )
+    fallback = None
+    for url, payload in payloads.items():
+        url_lower = url.lower()
+        if needle not in url_lower:
+            continue
+        if symbol_pattern is not None and symbol_pattern.search(url_lower):
+            return url, payload.get("body")
+        # Remember the first endpoint match in case no URL names the symbol.
+        if fallback is None:
+            fallback = (url, payload.get("body"))
+    return fallback if fallback else (None, None)
+
+
+def extract_quote_summary(payload):
+    """Return the first entry of ``payload["quoteSummary"]["result"]``.
+
+    Args:
+        payload: Decoded payload body. It may also be ``None`` or a non-dict
+            value, for example a body string that could not be JSON-decoded.
+
+    Returns:
+        The first result entry, or ``None`` when the payload is not a dict,
+        ``quoteSummary`` is missing or not a dict, or ``result`` is not a
+        non-empty list.
+    """
+    if not isinstance(payload, dict):
+        return None
+    summary = payload.get("quoteSummary")
+    if not isinstance(summary, dict):
+        return None
+    result = summary.get("result")
+    if not isinstance(result, list) or not result:
+        return None
+    return result[0]
+
+
+def extract_quote_response(payload):
+    """Return the first entry of ``payload["quoteResponse"]["result"]``.
+
+    Args:
+        payload: Decoded payload body. It may also be ``None`` or a non-dict
+            value, for example a body string that could not be JSON-decoded.
+
+    Returns:
+        The first result entry, or ``None`` when the payload is not a dict,
+        ``quoteResponse`` is missing or not a dict, or ``result`` is not a
+        non-empty list.
+    """
+    if not isinstance(payload, dict):
+        return None
+    response = payload.get("quoteResponse")
+    if not isinstance(response, dict):
+        return None
+    result = response.get("result")
+    if not isinstance(result, list) or not result:
+        return None
+    return result[0]
+
+
+def extract_quote_type(payload):
+    """Return the first entry of ``payload["quoteType"]["result"]``.
+
+    Args:
+        payload: Decoded payload body. It may also be ``None`` or a non-dict
+            value, for example a body string that could not be JSON-decoded.
+
+    Returns:
+        The first result entry, or ``None`` when the payload is not a dict,
+        ``quoteType`` is missing or not a dict, or ``result`` is not a
+        non-empty list.
+    """
+    if not isinstance(payload, dict):
+        return None
+    quote_type = payload.get("quoteType")
+    if not isinstance(quote_type, dict):
+        return None
+    result = quote_type.get("result")
+    if not isinstance(result, list) or not result:
+        return None
+    return result[0]
+
+
+def extract_recent_news_from_soup(soup, limit=20):
+    """Read story items (title, publisher, published, link) from a parsed page.
+
+    Stories are searched inside the ``recent-news`` container when it exists,
+    otherwise in the whole document. Items with neither an ``h3`` title nor a
+    link are skipped. Items are de-duplicated by link, or by title when there
+    is no link.
+
+    Args:
+        soup: Parsed document exposing ``select``/``select_one``; ``None`` is
+            accepted.
+        limit: Maximum number of items to return; a falsy value disables the
+            cap.
+
+    Returns:
+        A list of dicts with ``title``, ``publisher``, ``published`` and
+        ``link`` keys, in document order.
+    """
+    stories = []
+    if soup is None:
+        return stories
+    scope = soup.select_one('[data-testid="recent-news"]') or soup
+    dedupe_keys = set()
+    for story in scope.select('[data-testid="storyitem"]'):
+        heading = story.select_one("h3")
+        anchor = story.select_one("a[href]")
+        if not (heading or anchor):
+            continue
+        title = heading.get_text(strip=True) if heading else None
+        link = anchor.get("href") if anchor else None
+
+        publisher = published = None
+        byline = story.select_one(".publishing")
+        if byline:
+            byline_text = " ".join(byline.stripped_strings)
+            # The byline reads "<publisher> \u2022 <when>" when a bullet is present.
+            source, bullet, when = byline_text.partition("\u2022")
+            if bullet:
+                publisher = source.strip() or None
+                published = when.strip()
+            else:
+                publisher = byline_text or None
+
+        dedupe_key = link or title
+        if dedupe_key:
+            if dedupe_key in dedupe_keys:
+                continue
+            dedupe_keys.add(dedupe_key)
+
+        stories.append(
+            {
+                "title": title,
+                "publisher": publisher,
+                "published": published,
+                "link": link,
+            }
+        )
+        if limit and len(stories) >= limit:
+            break
+    return stories
+
+
+def extract_news_summary_from_soup(soup):
+    """Return the text of the ``ticker-news-summary`` element, if present.
+
+    Args:
+        soup: Parsed document exposing ``select_one``; ``None`` is accepted.
+
+    Returns:
+        The element's text fragments joined by single spaces, or ``None`` when
+        the document, the element, or its text is missing.
+    """
+    if soup is None:
+        return None
+    node = soup.select_one('[data-testid="ticker-news-summary"]')
+    if not node:
+        return None
+    return " ".join(node.stripped_strings) or None
+
+
+def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_data, quote):
+    """Flatten headline metrics from several quote sources into one dict.
+
+    Every argument may be ``None`` and is then treated as an empty mapping.
+    Each value is passed through ``extract_raw_value`` (defined elsewhere in
+    this module), except ``recommendation_key``, which is copied as-is. Where
+    a metric has several candidate sources, the first candidate that is not
+    ``None`` is used.
+
+    Returns:
+        A dict of snake_case metric names to values; metrics with no source
+        value are ``None``.
+    """
+    detail = summary_detail or {}
+    stats = key_stats or {}
+    financials = financial_data or {}
+    price = price_data or {}
+    quote_row = quote or {}
+
+    def raw(source, field):
+        return extract_raw_value(source.get(field))
+
+    def first_set(*candidates):
+        return next((c for c in candidates if c is not None), None)
+
+    return {
+        "previous_close": raw(detail, "previousClose"),
+        "open": raw(detail, "open"),
+        "bid": raw(detail, "bid"),
+        "ask": raw(detail, "ask"),
+        "bid_size": raw(detail, "bidSize"),
+        "ask_size": raw(detail, "askSize"),
+        "day_low": raw(detail, "dayLow"),
+        "day_high": raw(detail, "dayHigh"),
+        "fifty_two_week_low": raw(quote_row, "fiftyTwoWeekLow"),
+        "fifty_two_week_high": raw(quote_row, "fiftyTwoWeekHigh"),
+        "volume": first_set(
+            raw(detail, "volume"),
+            raw(price, "regularMarketVolume"),
+            raw(quote_row, "regularMarketVolume"),
+        ),
+        "average_volume": first_set(
+            raw(detail, "averageVolume"),
+            raw(price, "averageDailyVolume3Month"),
+        ),
+        "market_cap": first_set(
+            raw(detail, "marketCap"),
+            raw(quote_row, "marketCap"),
+        ),
+        "beta": first_set(raw(detail, "beta"), raw(stats, "beta")),
+        "trailing_pe": first_set(raw(detail, "trailingPE"), raw(stats, "trailingPE")),
+        "forward_pe": first_set(raw(detail, "forwardPE"), raw(stats, "forwardPE")),
+        "eps_trailing": raw(stats, "trailingEps"),
+        "eps_forward": raw(stats, "forwardEps"),
+        "dividend_rate": raw(detail, "dividendRate"),
+        "dividend_yield": raw(detail, "dividendYield"),
+        "ex_dividend_date": raw(detail, "exDividendDate"),
+        "payout_ratio": raw(detail, "payoutRatio"),
+        "current_price": first_set(
+            raw(price, "regularMarketPrice"),
+            raw(financials, "currentPrice"),
+            raw(quote_row, "regularMarketPrice"),
+        ),
+        "recommendation_key": financials.get("recommendationKey"),
+        "recommendation_mean": raw(financials, "recommendationMean"),
+        "target_price_high": raw(financials, "targetHighPrice"),
+        "target_price_low": raw(financials, "targetLowPrice"),
+        "target_price_mean": raw(financials, "targetMeanPrice"),
+        "target_price_median": raw(financials, "targetMedianPrice"),
+        "analyst_opinion_count": raw(financials, "numberOfAnalystOpinions"),
+    }
+
+
+def scrape_yahoo_profile(symbol):
+    """Scrape the Yahoo Finance quote page for ``symbol`` and build a profile.
+
+    The page is loaded once in headless Chromium. Embedded SvelteKit payloads
+    are read from the raw response HTML, falling back to the rendered DOM.
+    Recent news and the ticker news summary are read from the rendered DOM.
+
+    Args:
+        symbol: Ticker symbol as supplied by the caller; it is URL-quoted
+            before being placed in the page URL.
+
+    Returns:
+        dict: On success, the profile sections plus ``validation`` and
+        ``data_sources``. On failure, a dict with ``error``, ``stock`` and
+        ``url``; a validation failure also carries ``validation`` and
+        ``data_sources``.
+    """
+    encoded = urllib.parse.quote(symbol, safe="")
+    url = f"https://finance.yahoo.com/quote/{encoded}/"
+    app.logger.info("Starting profile scrape for symbol=%s url=%s", symbol, url)
+
+    response_html = None
+    rendered_html = None
+    news = []
+    news_summary = None
+
+    with sync_playwright() as p:
+        launch_args = chromium_launch_args()
+        if launch_args:
+            app.logger.info("GPU acceleration enabled")
+        else:
+            app.logger.info("GPU acceleration disabled")
+        browser = p.chromium.launch(headless=True, args=launch_args)
+        try:
+            # Page setup is inside the try so the browser is closed even when
+            # creating or configuring the page fails.
+            page = browser.new_page()
+            page.set_extra_http_headers(
+                {
+                    "User-Agent": (
+                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
+                    )
+                }
+            )
+            page.set_default_timeout(60000)
+
+            response = page.goto(url, wait_until="domcontentloaded", timeout=60000)
+            app.logger.info("Profile page loaded (domcontentloaded) for %s", symbol)
+            if response:
+                response_html = response.text()
+            else:
+                app.logger.warning("No response body for profile page %s", symbol)
+
+            # News is optional: a missing news block is logged, not fatal.
+            try:
+                page.wait_for_selector(
+                    '[data-testid="recent-news"], [data-testid="ticker-news-summary"]',
+                    timeout=15000,
+                )
+            except Exception as exc:
+                app.logger.warning("News content not detected for %s: %s", symbol, exc)
+
+            page.wait_for_timeout(2000)
+            rendered_html = page.content()
+        finally:
+            browser.close()
+
+    if not response_html and not rendered_html:
+        return {"error": "Profile page content missing", "stock": symbol, "url": url}
+
+    # Prefer the raw response HTML for payloads; fall back to the rendered DOM
+    # only when the raw HTML yields nothing and the two sources differ.
+    payload_source = response_html or rendered_html
+    payload_soup = BeautifulSoup(payload_source, "html.parser") if payload_source else None
+    payloads = extract_sveltekit_payloads_from_soup(payload_soup)
+    if not payloads and rendered_html and rendered_html != payload_source:
+        fallback_soup = BeautifulSoup(rendered_html, "html.parser")
+        payloads = extract_sveltekit_payloads_from_soup(fallback_soup)
+
+    if rendered_html:
+        news_soup = BeautifulSoup(rendered_html, "html.parser")
+        news = extract_recent_news_from_soup(news_soup, limit=20)
+        news_summary = extract_news_summary_from_soup(news_soup)
+
+    if not payloads:
+        return {
+            "error": "No embedded payloads found on profile page",
+            "stock": symbol,
+            "url": url,
+        }
+
+    quote_summary_url, quote_summary_payload = select_payload(
+        payloads, "quoteSummary", symbol
+    )
+    quote_url, quote_payload = select_payload(payloads, "v7/finance/quote?", symbol)
+    quote_type_url, quote_type_payload = select_payload(
+        payloads, "/v1/finance/quoteType/", symbol
+    )
+    ratings_url, ratings_payload = select_payload(payloads, "ratings/top", symbol)
+    recs_url, recs_payload = select_payload(
+        payloads, "recommendationsbysymbol", symbol
+    )
+
+    # Reported in both the success and the validation-failure responses.
+    data_sources = {
+        "quote_summary": quote_summary_url,
+        "quote": quote_url,
+        "quote_type": quote_type_url,
+        "ratings_top": ratings_url,
+        "recommendations": recs_url,
+    }
+
+    quote_summary = extract_quote_summary(quote_summary_payload)
+    quote = extract_quote_response(quote_payload)
+    quote_type = extract_quote_type(quote_type_payload)
+
+    # An absent quoteSummary behaves like an empty one for section lookups.
+    sections = quote_summary or {}
+    summary_profile = sections.get("summaryProfile", {})
+    summary_detail = sections.get("summaryDetail", {})
+    key_stats = sections.get("defaultKeyStatistics", {})
+    financial_data = sections.get("financialData", {})
+    price_data = sections.get("price", {})
+    recommendation_trend = sections.get("recommendationTrend", {})
+    upgrade_history = sections.get("upgradeDowngradeHistory", {})
+    earnings = sections.get("earnings", {})
+    earnings_gaap = sections.get("earningsGaap", {})
+    earnings_non_gaap = sections.get("earningsNonGaap", {})
+    calendar_events = sections.get("calendarEvents", {})
+    equity_performance = sections.get("equityPerformance", {})
+    performance_overview = sections.get("quoteUnadjustedPerformanceOverview", {})
+
+    key_metrics = build_profile_key_metrics(
+        summary_detail, key_stats, financial_data, price_data, quote
+    )
+
+    symbol_candidates = (
+        price_data.get("symbol") if price_data else None,
+        quote.get("symbol") if quote else None,
+        quote_type.get("symbol") if quote_type else None,
+    )
+    matched_symbols = [candidate for candidate in symbol_candidates if candidate]
+
+    # symbol_match stays None when no source reported a symbol at all.
+    symbol_match = None
+    if matched_symbols:
+        symbol_match = any(
+            candidate.upper() == symbol.upper() for candidate in matched_symbols
+        )
+
+    issues = []
+    if not quote_summary:
+        issues.append("missing_quote_summary")
+    if matched_symbols and not symbol_match:
+        issues.append("symbol_mismatch")
+    if not quote:
+        issues.append("missing_quote_data")
+    if not quote_type:
+        issues.append("missing_quote_type")
+
+    validation = {
+        "requested_symbol": symbol,
+        "matched_symbols": matched_symbols,
+        "symbol_match": symbol_match,
+        "issues": issues,
+    }
+
+    # Only these two issues are fatal; the others are reported in `validation`.
+    if "missing_quote_summary" in issues or "symbol_mismatch" in issues:
+        return {
+            "error": "Profile validation failed",
+            "stock": symbol,
+            "url": url,
+            "validation": validation,
+            "data_sources": data_sources,
+        }
+
+    return {
+        "stock": symbol,
+        "url": url,
+        # Timezone-aware replacement for the deprecated datetime.utcnow().
+        "fetched_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "validation": validation,
+        "company_profile": summary_profile,
+        "summary_detail": summary_detail,
+        "default_key_statistics": key_stats,
+        "financial_data": financial_data,
+        "price": price_data,
+        "earnings": earnings,
+        "earnings_gaap": earnings_gaap,
+        "earnings_non_gaap": earnings_non_gaap,
+        "calendar_events": calendar_events,
+        "equity_performance": equity_performance,
+        "performance_overview": performance_overview,
+        "recommendation_trend": recommendation_trend,
+        "upgrade_downgrade_history": upgrade_history,
+        "key_metrics": key_metrics,
+        "quote": quote,
+        "quote_type": quote_type,
+        "recommendations_by_symbol": recs_payload,
+        "ratings_top": ratings_payload,
+        "news_summary": news_summary,
+        "recent_news": news,
+        "data_sources": data_sources,
+    }
+
+
 def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
    def parse_table(table_html, side):
        if not table_html:
@@ -672,5 +1082,12 @@ def scrape_sync():
    return jsonify(scrape_yahoo_options(symbol, expiration, strike_limit))


+@app.route("/profile")
+def profile():
+    """Handle ``GET /profile`` and return the scraped profile as JSON.
+
+    Reads the ``stock`` query parameter. A missing, empty or whitespace-only
+    value falls back to ``MSFT``, so ``/profile?stock=`` no longer scrapes an
+    empty symbol. Surrounding whitespace is stripped.
+    """
+    symbol = (request.args.get("stock") or "").strip() or "MSFT"
+    app.logger.info("Received /profile request for symbol=%s", symbol)
+    return jsonify(scrape_yahoo_profile(symbol))
+
+
 if __name__ == "__main__":
    app.run(host="0.0.0.0", port=9777)
--- a/scripts/test_profile_cycles.py
+++ b/scripts/test_profile_cycles.py
@@ -0,0 +1,158 @@
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]  # used when --symbols is omitted or empty
+
+REQUIRED_SECTIONS = [  # top-level response keys validate_payload requires to be present
+    "company_profile",
+    "summary_detail",
+    "default_key_statistics",
+    "financial_data",
+    "price",
+    "key_metrics",
+    "recommendation_trend",
+    "upgrade_downgrade_history",
+    "earnings",
+    "calendar_events",
+    "equity_performance",
+    "performance_overview",
+    "quote",
+    "quote_type",
+    "recent_news",
+]
+
+REQUIRED_COMPANY_FIELDS = ["longBusinessSummary", "industry", "sector"]  # keys required inside company_profile
+REQUIRED_KEY_METRICS = [  # keys required inside key_metrics (presence only; values are not checked)
+    "previous_close",
+    "open",
+    "bid",
+    "ask",
+    "beta",
+    "eps_trailing",
+    "dividend_rate",
+    "current_price",
+]
+
+
+def http_get(base_url, params, timeout):
+    """GET ``base_url`` with ``params`` as the query string and decode JSON.
+
+    Args:
+        base_url: Endpoint URL; the encoded query is appended after a ``?``.
+        params: Mapping of query parameters, URL-encoded before sending.
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        The response body decoded as UTF-8 and parsed as JSON.
+    """
+    target = "{}?{}".format(base_url, urllib.parse.urlencode(params))
+    with urllib.request.urlopen(target, timeout=timeout) as response:
+        raw_bytes = response.read()
+        return json.loads(raw_bytes.decode("utf-8"))
+
+
+def parse_list(value, default):
+    """Split a comma-separated string into a list of stripped, non-empty items.
+
+    Args:
+        value: Comma-separated text, or a falsy value.
+        default: Returned when ``value`` is falsy or contains no items.
+
+    Returns:
+        The parsed items. ``default`` is returned when there are none, so
+        input such as ``","`` cannot produce an empty run that passes
+        vacuously.
+    """
+    if not value:
+        return default
+    items = [item.strip() for item in value.split(",") if item.strip()]
+    return items or default
+
+
+def build_signature(data):
+    """Summarise the shape of a profile response as sorted key lists.
+
+    For each tracked section the result holds ``<section>_keys`` mapped to the
+    sorted keys of that section; a missing section gives an empty list. The
+    caller compares signatures between runs to detect shape changes.
+    """
+    tracked_sections = (
+        "company_profile",
+        "summary_detail",
+        "default_key_statistics",
+        "financial_data",
+        "price",
+        "key_metrics",
+        "data_sources",
+    )
+    return {
+        f"{section}_keys": sorted(data.get(section, {}).keys())
+        for section in tracked_sections
+    }
+
+
+def validate_payload(symbol, data):
+    """Check one ``/profile`` response and describe the first problem found.
+
+    Checks run in this order: API error, echoed symbol, ``validation`` block,
+    required sections, required company fields, required key metrics, and
+    finally that at least one of ``news_summary`` / ``recent_news`` is set.
+
+    Returns:
+        An error message string for the first failed check, or ``None`` when
+        every check passes.
+    """
+    if "error" in data:
+        return f"API error for {symbol}: {data}"
+
+    reported = data.get("stock", "")
+    if reported.upper() != symbol.upper():
+        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
+
+    validation = data.get("validation", {})
+    if validation.get("symbol_match") is not True:
+        return f"Validation symbol_match failed for {symbol}: {validation}"
+    if validation.get("issues"):
+        return f"Validation issues for {symbol}: {validation}"
+
+    absent = next((name for name in REQUIRED_SECTIONS if name not in data), None)
+    if absent is not None:
+        return f"Missing section {absent} for {symbol}"
+
+    company_profile = data.get("company_profile", {})
+    absent = next(
+        (name for name in REQUIRED_COMPANY_FIELDS if name not in company_profile),
+        None,
+    )
+    if absent is not None:
+        return f"Missing company field {absent} for {symbol}"
+
+    key_metrics = data.get("key_metrics", {})
+    absent = next(
+        (name for name in REQUIRED_KEY_METRICS if name not in key_metrics), None
+    )
+    if absent is not None:
+        return f"Missing key metric {absent} for {symbol}"
+
+    has_news = data.get("news_summary") or data.get("recent_news")
+    if not has_news:
+        return f"Missing news summary and recent news for {symbol}"
+
+    return None
+
+
+def main():
+    """Run repeated ``/profile`` validation cycles from the command line.
+
+    For every run and symbol the endpoint is fetched, the response is checked
+    with ``validate_payload``, and its signature is compared with the first
+    one recorded for that symbol. The process exits with status 1 on the
+    first validation error or signature change.
+    """
+    cli_options = (
+        (
+            "--base-url",
+            {"default": "http://127.0.0.1:9777/profile", "help": "Base URL for /profile"},
+        ),
+        (
+            "--symbols",
+            {"default": ",".join(DEFAULT_SYMBOLS), "help": "Comma-separated stock symbols"},
+        ),
+        (
+            "--runs",
+            {"type": int, "default": 8, "help": "Number of validation runs per symbol"},
+        ),
+        (
+            "--timeout",
+            {"type": int, "default": 180, "help": "Request timeout in seconds"},
+        ),
+        (
+            "--sleep",
+            {"type": float, "default": 0.2, "help": "Sleep between requests"},
+        ),
+    )
+    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
+    for flag, spec in cli_options:
+        parser.add_argument(flag, **spec)
+    args = parser.parse_args()
+
+    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
+    baselines = {}
+
+    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
+    for run in range(1, args.runs + 1):
+        print(f"Cycle {run}/{args.runs}")
+        for symbol in symbols:
+            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
+            error = validate_payload(symbol, data)
+            if error:
+                print(f"ERROR: {error}")
+                sys.exit(1)
+            signature = build_signature(data)
+            # The first signature seen for a symbol becomes its baseline.
+            baseline = baselines.setdefault(symbol, signature)
+            if baseline != signature:
+                print(f"ERROR: Signature changed for {symbol}")
+                print(f"Baseline: {baseline}")
+                print(f"Current:  {signature}")
+                sys.exit(1)
+            time.sleep(args.sleep)
+        print(f"Cycle {run} OK")
+
+    print("\nAll profile cycles completed successfully.")
+
+
+if __name__ == "__main__":  # run the cycles only when executed as a script, not on import
+    main()