diff --git a/AGENTS.md b/AGENTS.md
index 1dfcf9b..7bf4b05 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,6 +16,14 @@
 - If `strikeLimit` is greater than available strikes, all available rows are returned.
 - `pruned_calls_count` and `pruned_puts_count` report how many rows were removed beyond the limit.
 - `selected_expiration` reports the resolved expiry (epoch + label), and mismatches return an error.
+- Route: `GET /profile`
+- Query params:
+  - `stock`: symbol (default `MSFT`).
+- Behavior:
+  - Loads `https://finance.yahoo.com/quote/{stock}/` with Playwright.
+  - Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations).
+  - Parses the rendered DOM for recent news and the ticker news summary.
+  - Returns company profile, key stats, earnings, analyst data, performance, and news as JSON.

 ## Guard Rails
 - Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push.
@@ -28,9 +36,26 @@
 - Local server:
   - Start: `.\venv\Scripts\python.exe scraper_service.py`
   - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (local server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
 - Docker server:
   - Start: `docker run --rm -p 9777:9777 rushabhtechie/yahoo-scraper:latest`
   - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (docker server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
+
+## Update Log (2025-12-28)
+- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations) plus rendered news extraction.
+- Response now includes company profile, key stats, earnings, analyst trends, performance overview, ticker news summary, and recent news items.
+- Validation added to ensure quote data matches the requested symbol, with issues reported in `validation`.
+- Issue encountered: a stale server instance was bound to port 9777 without `/profile`; resolved by restarting the service with the updated script.
+- Tests executed (local):
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --runs 8 --timeout 180`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Tests executed (docker):
+  - `docker build -t rushabhtechie/yahoo-scraper:latest .`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8 --timeout 180`
 - The test harness verifies:
   - Requested expiration matches `selected_expiration.value`.
   - Contract symbols include the expected YYMMDD code.
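For a quick manual spot-check of the new route outside the harness, a minimal sketch (assuming the local server above is listening on 127.0.0.1:9777; the fields read at the end are part of the documented response):

```python
import json
import urllib.parse
import urllib.request

# Assumes the local server started with scraper_service.py is up on port 9777.
base_url = "http://127.0.0.1:9777/profile"
query = urllib.parse.urlencode({"stock": "MSFT"})

with urllib.request.urlopen(f"{base_url}?{query}", timeout=180) as resp:
    data = json.loads(resp.read().decode("utf-8"))

# `validation` carries symbol_match plus any issues; `key_metrics` is the
# flattened metric map the response documents.
print(data["validation"])
print(data["key_metrics"]["current_price"])
```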
diff --git a/scraper_service.py b/scraper_service.py
index 87e4d25..ca84abb 100644
--- a/scraper_service.py
+++ b/scraper_service.py
@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
 from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone
+import html
 import urllib.parse
 import logging
 import json
@@ -353,6 +354,415 @@ def parse_strike_limit(value, default=25):
     return limit if limit > 0 else default


+def parse_sveltekit_payload(raw_text):
+    if not raw_text:
+        return None
+    try:
+        outer = json.loads(raw_text)
+    except json.JSONDecodeError:
+        return None
+    body = outer.get("body")
+    if isinstance(body, str):
+        try:
+            body = json.loads(body)
+        except json.JSONDecodeError:
+            pass
+    return {
+        "status": outer.get("status"),
+        "statusText": outer.get("statusText"),
+        "body": body,
+    }
+
+
+def extract_sveltekit_payloads_from_soup(soup):
+    payloads = {}
+    if soup is None:
+        return payloads
+    scripts = soup.select('script[type="application/json"][data-sveltekit-fetched]')
+    for script in scripts:
+        url = script.get("data-url")
+        if not url:
+            continue
+        url = html.unescape(url)
+        raw_text = script.string or script.get_text()
+        payload = parse_sveltekit_payload(raw_text)
+        if not payload:
+            continue
+        payloads[url] = payload
+    return payloads
+
+
+def select_payload(payloads, needle, symbol=None):
+    if not payloads:
+        return None, None
+    needle = needle.lower()
+    symbol_token = symbol.lower() if symbol else None
+    fallback = None
+    for url, payload in payloads.items():
+        url_lower = url.lower()
+        if needle not in url_lower:
+            continue
+        if symbol_token:
+            if f"/{symbol_token}" in url_lower or f"symbols={symbol_token}" in url_lower:
+                return url, payload.get("body")
+        if fallback is None:
+            fallback = (url, payload.get("body"))
+    return fallback if fallback else (None, None)
+
+
+def extract_quote_summary(payload):
+    if not payload:
+        return None
+    summary = payload.get("quoteSummary")
+    if not summary:
+        return None
+    result = summary.get("result") or []
+    return result[0] if result else None
+
+
+def extract_quote_response(payload):
+    if not payload:
+        return None
+    response = payload.get("quoteResponse")
+    if not response:
+        return None
+    result = response.get("result") or []
+    return result[0] if result else None
+
+
+def extract_quote_type(payload):
+    if not payload:
+        return None
+    quote_type = payload.get("quoteType")
+    if not quote_type:
+        return None
+    result = quote_type.get("result") or []
+    return result[0] if result else None
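To make the payload plumbing concrete, a hand-written, heavily trimmed example of the markup these helpers target (real `data-url` values and bodies are much larger; note the double-encoded `body` and the HTML-escaped `&amp;` that `html.unescape` resolves):

```python
from bs4 import BeautifulSoup

# Fabricated, trimmed markup in the shape SvelteKit embeds; run with the
# helpers above in scope. The raw string keeps the \" escapes literal.
sample_html = r"""
<script type="application/json" data-sveltekit-fetched
        data-url="https://query1.finance.yahoo.com/v10/finance/quoteSummary/MSFT?modules=summaryProfile&amp;lang=en-US">
{"status": 200, "statusText": "OK",
 "body": "{\"quoteSummary\": {\"result\": [{\"summaryProfile\": {\"sector\": \"Technology\"}}]}}"}
</script>
"""

soup = BeautifulSoup(sample_html, "html.parser")
payloads = extract_sveltekit_payloads_from_soup(soup)
url, body = select_payload(payloads, "quoteSummary", "MSFT")
print(url)  # unescaped URL with a plain & separator
print(extract_quote_summary(body)["summaryProfile"]["sector"])  # Technology
```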
"published": published, + "link": link, + } + ) + if limit and len(items) >= limit: + break + return items + + +def extract_news_summary_from_soup(soup): + if soup is None: + return None + summary = soup.select_one('[data-testid="ticker-news-summary"]') + if not summary: + return None + text = " ".join(summary.stripped_strings) + return text if text else None + + +def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_data, quote): + summary_detail = summary_detail or {} + key_stats = key_stats or {} + financial_data = financial_data or {} + price_data = price_data or {} + quote = quote or {} + + def pick_value(*values): + for value in values: + if value is not None: + return value + return None + + return { + "previous_close": extract_raw_value(summary_detail.get("previousClose")), + "open": extract_raw_value(summary_detail.get("open")), + "bid": extract_raw_value(summary_detail.get("bid")), + "ask": extract_raw_value(summary_detail.get("ask")), + "bid_size": extract_raw_value(summary_detail.get("bidSize")), + "ask_size": extract_raw_value(summary_detail.get("askSize")), + "day_low": extract_raw_value(summary_detail.get("dayLow")), + "day_high": extract_raw_value(summary_detail.get("dayHigh")), + "fifty_two_week_low": extract_raw_value(quote.get("fiftyTwoWeekLow")), + "fifty_two_week_high": extract_raw_value(quote.get("fiftyTwoWeekHigh")), + "volume": pick_value( + extract_raw_value(summary_detail.get("volume")), + extract_raw_value(price_data.get("regularMarketVolume")), + extract_raw_value(quote.get("regularMarketVolume")), + ), + "average_volume": pick_value( + extract_raw_value(summary_detail.get("averageVolume")), + extract_raw_value(price_data.get("averageDailyVolume3Month")), + ), + "market_cap": pick_value( + extract_raw_value(summary_detail.get("marketCap")), + extract_raw_value(quote.get("marketCap")), + ), + "beta": pick_value( + extract_raw_value(summary_detail.get("beta")), + extract_raw_value(key_stats.get("beta")), + ), + "trailing_pe": pick_value( + extract_raw_value(summary_detail.get("trailingPE")), + extract_raw_value(key_stats.get("trailingPE")), + ), + "forward_pe": pick_value( + extract_raw_value(summary_detail.get("forwardPE")), + extract_raw_value(key_stats.get("forwardPE")), + ), + "eps_trailing": extract_raw_value(key_stats.get("trailingEps")), + "eps_forward": extract_raw_value(key_stats.get("forwardEps")), + "dividend_rate": extract_raw_value(summary_detail.get("dividendRate")), + "dividend_yield": extract_raw_value(summary_detail.get("dividendYield")), + "ex_dividend_date": extract_raw_value(summary_detail.get("exDividendDate")), + "payout_ratio": extract_raw_value(summary_detail.get("payoutRatio")), + "current_price": pick_value( + extract_raw_value(price_data.get("regularMarketPrice")), + extract_raw_value(financial_data.get("currentPrice")), + extract_raw_value(quote.get("regularMarketPrice")), + ), + "recommendation_key": financial_data.get("recommendationKey"), + "recommendation_mean": extract_raw_value(financial_data.get("recommendationMean")), + "target_price_high": extract_raw_value(financial_data.get("targetHighPrice")), + "target_price_low": extract_raw_value(financial_data.get("targetLowPrice")), + "target_price_mean": extract_raw_value(financial_data.get("targetMeanPrice")), + "target_price_median": extract_raw_value(financial_data.get("targetMedianPrice")), + "analyst_opinion_count": extract_raw_value( + financial_data.get("numberOfAnalystOpinions") + ), + } + + +def scrape_yahoo_profile(symbol): + encoded = 
urllib.parse.quote(symbol, safe="") + url = f"https://finance.yahoo.com/quote/{encoded}/" + app.logger.info("Starting profile scrape for symbol=%s url=%s", symbol, url) + + response_html = None + rendered_html = None + payloads = {} + news = [] + news_summary = None + + with sync_playwright() as p: + launch_args = chromium_launch_args() + if launch_args: + app.logger.info("GPU acceleration enabled") + else: + app.logger.info("GPU acceleration disabled") + browser = p.chromium.launch(headless=True, args=launch_args) + page = browser.new_page() + page.set_extra_http_headers( + { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36" + ) + } + ) + page.set_default_timeout(60000) + + try: + response = page.goto(url, wait_until="domcontentloaded", timeout=60000) + app.logger.info("Profile page loaded (domcontentloaded) for %s", symbol) + if response: + response_html = response.text() + else: + app.logger.warning("No response body for profile page %s", symbol) + + try: + page.wait_for_selector( + '[data-testid="recent-news"], [data-testid="ticker-news-summary"]', + timeout=15000, + ) + except Exception as exc: + app.logger.warning("News content not detected for %s: %s", symbol, exc) + + page.wait_for_timeout(2000) + rendered_html = page.content() + finally: + browser.close() + + if not response_html and not rendered_html: + return {"error": "Profile page content missing", "stock": symbol, "url": url} + + payload_source = response_html or rendered_html + payload_soup = BeautifulSoup(payload_source, "html.parser") if payload_source else None + payloads = extract_sveltekit_payloads_from_soup(payload_soup) + if not payloads and rendered_html and rendered_html != payload_source: + fallback_soup = BeautifulSoup(rendered_html, "html.parser") + payloads = extract_sveltekit_payloads_from_soup(fallback_soup) + + if rendered_html: + news_soup = BeautifulSoup(rendered_html, "html.parser") + news = extract_recent_news_from_soup(news_soup, limit=20) + news_summary = extract_news_summary_from_soup(news_soup) + + if not payloads: + return { + "error": "No embedded payloads found on profile page", + "stock": symbol, + "url": url, + } + + quote_summary_url, quote_summary_payload = select_payload( + payloads, "quoteSummary", symbol + ) + quote_url, quote_payload = select_payload(payloads, "v7/finance/quote?", symbol) + quote_type_url, quote_type_payload = select_payload( + payloads, "/v1/finance/quoteType/", symbol + ) + ratings_url, ratings_payload = select_payload(payloads, "ratings/top", symbol) + recs_url, recs_payload = select_payload( + payloads, "recommendationsbysymbol", symbol + ) + + quote_summary = extract_quote_summary(quote_summary_payload) + quote = extract_quote_response(quote_payload) + quote_type = extract_quote_type(quote_type_payload) + + summary_profile = quote_summary.get("summaryProfile", {}) if quote_summary else {} + summary_detail = quote_summary.get("summaryDetail", {}) if quote_summary else {} + key_stats = quote_summary.get("defaultKeyStatistics", {}) if quote_summary else {} + financial_data = quote_summary.get("financialData", {}) if quote_summary else {} + price_data = quote_summary.get("price", {}) if quote_summary else {} + recommendation_trend = ( + quote_summary.get("recommendationTrend", {}) if quote_summary else {} + ) + upgrade_history = ( + quote_summary.get("upgradeDowngradeHistory", {}) if quote_summary else {} + ) + earnings = quote_summary.get("earnings", {}) if quote_summary else {} + 
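Callers see three failure shapes from `scrape_yahoo_profile`: missing page content, no embedded payloads, and failed validation; only the last carries `validation` and `data_sources`. A minimal caller-side sketch:

```python
# Sketch of caller-side handling; field names match the dicts built above.
result = scrape_yahoo_profile("MSFT")
if "error" in result:
    # Validation failures additionally expose the issue list and the
    # payload URLs that were (or were not) matched.
    print(result["error"], result.get("validation", {}).get("issues"))
else:
    print(result["fetched_at"], len(result["recent_news"]))
```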
+
+
 def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
     def parse_table(table_html, side):
         if not table_html:
@@ -672,5 +1082,12 @@ def scrape_sync():
     return jsonify(scrape_yahoo_options(symbol, expiration, strike_limit))


+@app.route("/profile")
+def profile():
+    symbol = request.args.get("stock", "MSFT")
+    app.logger.info("Received /profile request for symbol=%s", symbol)
+    return jsonify(scrape_yahoo_profile(symbol))
+
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=9777)
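One detail worth noting: the route quotes the symbol with `safe=""` before building the page URL, so tickers with special characters survive as a single path segment. A quick illustration with made-up inputs:

```python
import urllib.parse

# Illustrative symbols; "^" must be percent-encoded to stay in one segment.
for symbol in ["MSFT", "BRK-B", "^GSPC"]:
    encoded = urllib.parse.quote(symbol, safe="")
    print(f"https://finance.yahoo.com/quote/{encoded}/")
# .../quote/MSFT/  .../quote/BRK-B/  .../quote/%5EGSPC/
```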
diff --git a/scripts/test_profile_cycles.py b/scripts/test_profile_cycles.py
new file mode 100644
index 0000000..7c048be
--- /dev/null
+++ b/scripts/test_profile_cycles.py
@@ -0,0 +1,158 @@
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]
+
+REQUIRED_SECTIONS = [
+    "company_profile",
+    "summary_detail",
+    "default_key_statistics",
+    "financial_data",
+    "price",
+    "key_metrics",
+    "recommendation_trend",
+    "upgrade_downgrade_history",
+    "earnings",
+    "calendar_events",
+    "equity_performance",
+    "performance_overview",
+    "quote",
+    "quote_type",
+    "recent_news",
+]
+
+REQUIRED_COMPANY_FIELDS = ["longBusinessSummary", "industry", "sector"]
+REQUIRED_KEY_METRICS = [
+    "previous_close",
+    "open",
+    "bid",
+    "ask",
+    "beta",
+    "eps_trailing",
+    "dividend_rate",
+    "current_price",
+]
+
+
+def http_get(base_url, params, timeout):
+    query = urllib.parse.urlencode(params)
+    url = f"{base_url}?{query}"
+    with urllib.request.urlopen(url, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def parse_list(value, default):
+    if not value:
+        return default
+    return [item.strip() for item in value.split(",") if item.strip()]
+
+
+def build_signature(data):
+    return {
+        "company_profile_keys": sorted(data.get("company_profile", {}).keys()),
+        "summary_detail_keys": sorted(data.get("summary_detail", {}).keys()),
+        "default_key_statistics_keys": sorted(
+            data.get("default_key_statistics", {}).keys()
+        ),
+        "financial_data_keys": sorted(data.get("financial_data", {}).keys()),
+        "price_keys": sorted(data.get("price", {}).keys()),
+        "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
+        "data_sources_keys": sorted(data.get("data_sources", {}).keys()),
+    }
+
+
+def validate_payload(symbol, data):
+    if "error" in data:
+        return f"API error for {symbol}: {data}"
+    if data.get("stock", "").upper() != symbol.upper():
+        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
+    validation = data.get("validation", {})
+    if validation.get("symbol_match") is not True:
+        return f"Validation symbol_match failed for {symbol}: {validation}"
+    if validation.get("issues"):
+        return f"Validation issues for {symbol}: {validation}"
+
+    for section in REQUIRED_SECTIONS:
+        if section not in data:
+            return f"Missing section {section} for {symbol}"
+
+    company_profile = data.get("company_profile", {})
+    for field in REQUIRED_COMPANY_FIELDS:
+        if field not in company_profile:
+            return f"Missing company field {field} for {symbol}"
+
+    key_metrics = data.get("key_metrics", {})
+    for field in REQUIRED_KEY_METRICS:
+        if field not in key_metrics:
+            return f"Missing key metric {field} for {symbol}"
+
+    if not data.get("news_summary") and not data.get("recent_news"):
+        return f"Missing news summary and recent news for {symbol}"
+
+    return None
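`build_signature` records key sets rather than values, so the harness flags schema drift across cycles while tolerating normal price movement. A toy check with fabricated dicts:

```python
# Values may change between cycles; only a changed key set trips the check.
baseline = build_signature({"key_metrics": {"open": 1, "bid": 2}})
same_keys = build_signature({"key_metrics": {"open": 9, "bid": 8}})
drifted = build_signature({"key_metrics": {"open": 9}})

assert baseline == same_keys  # value changes pass
assert baseline != drifted    # a dropped key is reported as drift
```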
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
+    parser.add_argument(
+        "--base-url",
+        default="http://127.0.0.1:9777/profile",
+        help="Base URL for /profile",
+    )
+    parser.add_argument(
+        "--symbols",
+        default=",".join(DEFAULT_SYMBOLS),
+        help="Comma-separated stock symbols",
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=8,
+        help="Number of validation runs per symbol",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=180,
+        help="Request timeout in seconds",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.2,
+        help="Seconds to sleep between requests",
+    )
+    args = parser.parse_args()
+
+    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
+    signatures = {}
+
+    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
+    for run in range(1, args.runs + 1):
+        print(f"Cycle {run}/{args.runs}")
+        for symbol in symbols:
+            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
+            error = validate_payload(symbol, data)
+            if error:
+                print(f"ERROR: {error}")
+                sys.exit(1)
+            signature = build_signature(data)
+            if symbol not in signatures:
+                signatures[symbol] = signature
+            elif signatures[symbol] != signature:
+                print(f"ERROR: Signature changed for {symbol}")
+                print(f"Baseline: {signatures[symbol]}")
+                print(f"Current: {signature}")
+                sys.exit(1)
+            time.sleep(args.sleep)
+        print(f"Cycle {run} OK")
+
+    print("\nAll profile cycles completed successfully.")
+
+
+if __name__ == "__main__":
+    main()
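A typical invocation mirrors the AGENTS.md commands; the transcript below shows the shape of a passing run as implied by the print statements above, not captured output:

```
$ python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8
Running 8 profile cycles for: AAPL, AMZN, MSFT, TSLA
Cycle 1/8
Cycle 1 OK
...
Cycle 8/8
Cycle 8 OK

All profile cycles completed successfully.
```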