diff --git a/AGENTS.md b/AGENTS.md index 7bf4b05..136be1a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,8 +22,7 @@ - Behavior: - Loads `https://finance.yahoo.com/quote//` with Playwright. - Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations). - - Parses rendered DOM for recent news and the ticker news summary. - - Returns company profile, key stats, earnings, analyst data, performance, and news in JSON. + - Returns a pruned JSON with valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries. ## Guard Rails - Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push. @@ -45,12 +44,12 @@ - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8` ## Update Log (2025-12-28) -- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations) plus rendered news extraction. -- Response now includes company profile, key stats, earnings, analyst trends, performance overview, ticker news summary, and recent news items. -- Validation added to ensure quote data matches the requested symbol, with issues reported in `validation`. +- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations). +- `/profile` response trimmed to focus on valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries. +- Validation ensures quote data matches the requested symbol, with issues reported in `validation`. - Issue encountered: existing server instance bound to port 9777 without `/profile`, resolved by restarting the service with the updated script. - Tests executed (local): - - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --runs 8 --timeout 180` + - `.\venv\Scripts\python.exe scripts/test_profile_cycles.py --runs 8 --timeout 180` - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync` - Tests executed (docker): - `docker build -t rushabhtechie/yahoo-scraper:latest .` diff --git a/scraper_service.py b/scraper_service.py index ca84abb..5ac4a75 100644 --- a/scraper_service.py +++ b/scraper_service.py @@ -106,6 +106,14 @@ def extract_raw_value(value): return value +def extract_value(value): + if isinstance(value, dict): + if value.get("raw") is not None: + return value.get("raw") + return value.get("fmt") + return value + + def extract_fmt_value(value): if isinstance(value, dict): return value.get("fmt") @@ -548,6 +556,7 @@ def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_d "dividend_yield": extract_raw_value(summary_detail.get("dividendYield")), "ex_dividend_date": extract_raw_value(summary_detail.get("exDividendDate")), "payout_ratio": extract_raw_value(summary_detail.get("payoutRatio")), + "implied_volatility": extract_raw_value(summary_detail.get("impliedVolatility")), "current_price": pick_value( extract_raw_value(price_data.get("regularMarketPrice")), extract_raw_value(financial_data.get("currentPrice")), @@ -565,6 +574,165 @@ def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_d } +def simplify_recommendation_trend(trend): + simplified = [] + for entry in trend or []: + simplified.append( + { + "period": entry.get("period"), + "strong_buy": entry.get("strongBuy"), + "buy": entry.get("buy"), + "hold": entry.get("hold"), + "sell": entry.get("sell"), + "strong_sell": entry.get("strongSell"), + } + ) + return simplified + + +def simplify_upgrade_history(history, limit=20): + simplified = [] + for entry in history or []: + simplified.append( + { + "firm": entry.get("firm"), + "action": entry.get("action"), + "from_grade": entry.get("fromGrade"), + "to_grade": entry.get("toGrade"), + "date": entry.get("epochGradeDate") or entry.get("gradeDate"), + } + ) + if limit and len(simplified) >= limit: + break + return simplified + + +def simplify_ratings_top(payload): + if not payload: + return None + simplified = {} + for key, value in payload.items(): + if not isinstance(value, dict): + continue + simplified[key] = { + "analyst": value.get("analyst"), + "rating_current": value.get("rating_current"), + "rating_sentiment": value.get("rating_sentiment"), + "pt_current": value.get("pt_current"), + "adjusted_pt_current": value.get("adjusted_pt_current"), + "announcement_date": value.get("announcement_date"), + "datapoints": value.get("datapoints"), + "scores": { + "dir": extract_value(value.get("dir")), + "mm": extract_value(value.get("mm")), + "pt": extract_value(value.get("pt")), + "fin_score": extract_value(value.get("fin_score")), + }, + } + return simplified or None + + +def summarize_performance(perf_data): + if not perf_data: + return {} + overview = perf_data.get("performanceOverview") + if isinstance(overview, dict): + return { + "as_of_date": extract_value(overview.get("asOfDate")), + "returns": { + "five_day": extract_value(overview.get("fiveDaysReturn")), + "one_month": extract_value(overview.get("oneMonthReturn")), + "three_month": extract_value(overview.get("threeMonthReturn")), + "six_month": extract_value(overview.get("sixMonthReturn")), + "ytd": extract_value(overview.get("ytdReturnPct")), + "one_year": extract_value(overview.get("oneYearTotalReturn")), + "two_year": extract_value(overview.get("twoYearTotalReturn")), + "three_year": extract_value(overview.get("threeYearTotalReturn")), + "five_year": extract_value(overview.get("fiveYearTotalReturn")), + "ten_year": extract_value(overview.get("tenYearTotalReturn")), + "max": extract_value(overview.get("maxReturn")), + }, + } + summary = [] + for entry in overview or []: + if not isinstance(entry, dict): + continue + summary.append( + { + "period": entry.get("period"), + "performance": extract_value(entry.get("performance")), + "benchmark": extract_value(entry.get("benchmark")), + } + ) + return {"periods": summary} if summary else {} + + +def summarize_earnings(earnings, calendar_events): + earnings = earnings or {} + calendar_events = calendar_events or {} + earnings_chart = earnings.get("earningsChart", {}) or {} + financials_chart = earnings.get("financialsChart", {}) or {} + calendar_earnings = calendar_events.get("earnings", {}) or {} + + quarterly = [] + for entry in earnings_chart.get("quarterly") or []: + quarterly.append( + { + "quarter": entry.get("date"), + "actual": extract_value(entry.get("actual")), + "estimate": extract_value(entry.get("estimate")), + "surprise": extract_value(entry.get("difference")), + "surprise_percent": extract_value(entry.get("surprisePct")), + } + ) + + yearly = [] + for entry in financials_chart.get("yearly") or []: + yearly.append( + { + "year": entry.get("date"), + "revenue": extract_value(entry.get("revenue")), + "earnings": extract_value(entry.get("earnings")), + } + ) + + quarterly_financials = [] + for entry in financials_chart.get("quarterly") or []: + quarterly_financials.append( + { + "quarter": entry.get("date"), + "revenue": extract_value(entry.get("revenue")), + "earnings": extract_value(entry.get("earnings")), + } + ) + + return { + "next_earnings_dates": [ + extract_value(value) for value in calendar_earnings.get("earningsDate", []) or [] + ], + "is_earnings_date_estimate": calendar_earnings.get("isEarningsDateEstimate"), + "earnings_estimates": { + "average": extract_value(calendar_earnings.get("earningsAverage")), + "low": extract_value(calendar_earnings.get("earningsLow")), + "high": extract_value(calendar_earnings.get("earningsHigh")), + }, + "revenue_estimates": { + "average": extract_value(calendar_earnings.get("revenueAverage")), + "low": extract_value(calendar_earnings.get("revenueLow")), + "high": extract_value(calendar_earnings.get("revenueHigh")), + }, + "quarterly_earnings": quarterly[:4], + "yearly_financials": yearly[:4], + "quarterly_financials": quarterly_financials[:4], + "current_quarter_estimate": extract_value( + earnings_chart.get("currentQuarterEstimate") + ), + "current_quarter_estimate_date": earnings_chart.get("currentQuarterEstimateDate"), + "current_calendar_quarter": earnings_chart.get("currentCalendarQuarter"), + "current_fiscal_quarter": earnings_chart.get("currentFiscalQuarter"), + } + + def scrape_yahoo_profile(symbol): encoded = urllib.parse.quote(symbol, safe="") url = f"https://finance.yahoo.com/quote/{encoded}/" @@ -573,8 +741,6 @@ def scrape_yahoo_profile(symbol): response_html = None rendered_html = None payloads = {} - news = [] - news_summary = None with sync_playwright() as p: launch_args = chromium_launch_args() @@ -602,15 +768,7 @@ def scrape_yahoo_profile(symbol): else: app.logger.warning("No response body for profile page %s", symbol) - try: - page.wait_for_selector( - '[data-testid="recent-news"], [data-testid="ticker-news-summary"]', - timeout=15000, - ) - except Exception as exc: - app.logger.warning("News content not detected for %s: %s", symbol, exc) - - page.wait_for_timeout(2000) + page.wait_for_timeout(1000) rendered_html = page.content() finally: browser.close() @@ -625,11 +783,6 @@ def scrape_yahoo_profile(symbol): fallback_soup = BeautifulSoup(rendered_html, "html.parser") payloads = extract_sveltekit_payloads_from_soup(fallback_soup) - if rendered_html: - news_soup = BeautifulSoup(rendered_html, "html.parser") - news = extract_recent_news_from_soup(news_soup, limit=20) - news_summary = extract_news_summary_from_soup(news_soup) - if not payloads: return { "error": "No embedded payloads found on profile page", @@ -653,7 +806,6 @@ def scrape_yahoo_profile(symbol): quote = extract_quote_response(quote_payload) quote_type = extract_quote_type(quote_type_payload) - summary_profile = quote_summary.get("summaryProfile", {}) if quote_summary else {} summary_detail = quote_summary.get("summaryDetail", {}) if quote_summary else {} key_stats = quote_summary.get("defaultKeyStatistics", {}) if quote_summary else {} financial_data = quote_summary.get("financialData", {}) if quote_summary else {} @@ -665,8 +817,6 @@ def scrape_yahoo_profile(symbol): quote_summary.get("upgradeDowngradeHistory", {}) if quote_summary else {} ) earnings = quote_summary.get("earnings", {}) if quote_summary else {} - earnings_gaap = quote_summary.get("earningsGaap", {}) if quote_summary else {} - earnings_non_gaap = quote_summary.get("earningsNonGaap", {}) if quote_summary else {} calendar_events = quote_summary.get("calendarEvents", {}) if quote_summary else {} equity_performance = ( quote_summary.get("equityPerformance", {}) if quote_summary else {} @@ -681,6 +831,79 @@ def scrape_yahoo_profile(symbol): summary_detail, key_stats, financial_data, price_data, quote ) + valuation = { + "market_cap": extract_raw_value(key_stats.get("marketCap")), + "enterprise_value": extract_raw_value(key_stats.get("enterpriseValue")), + "price_to_book": extract_raw_value(key_stats.get("priceToBook")), + "price_to_sales": extract_raw_value(key_stats.get("priceToSalesTrailing12Months")), + "trailing_pe": key_metrics.get("trailing_pe"), + "forward_pe": key_metrics.get("forward_pe"), + } + + profitability = { + "profit_margins": extract_raw_value(financial_data.get("profitMargins")), + "operating_margins": extract_raw_value(financial_data.get("operatingMargins")), + "gross_margins": extract_raw_value(financial_data.get("grossMargins")), + "ebitda_margins": extract_raw_value(financial_data.get("ebitdaMargins")), + "return_on_assets": extract_raw_value(financial_data.get("returnOnAssets")), + "return_on_equity": extract_raw_value(financial_data.get("returnOnEquity")), + } + + growth = { + "revenue_growth": extract_raw_value(financial_data.get("revenueGrowth")), + "earnings_growth": extract_raw_value(financial_data.get("earningsGrowth")), + "revenue_per_share": extract_raw_value(financial_data.get("revenuePerShare")), + } + + financial_strength = { + "total_cash": extract_raw_value(financial_data.get("totalCash")), + "total_debt": extract_raw_value(financial_data.get("totalDebt")), + "debt_to_equity": extract_raw_value(financial_data.get("debtToEquity")), + "current_ratio": extract_raw_value(financial_data.get("currentRatio")), + "quick_ratio": extract_raw_value(financial_data.get("quickRatio")), + } + + cashflow = { + "operating_cashflow": extract_raw_value(financial_data.get("operatingCashflow")), + "free_cashflow": extract_raw_value(financial_data.get("freeCashflow")), + "ebitda": extract_raw_value(financial_data.get("ebitda")), + } + + ownership = { + "shares_outstanding": extract_raw_value(key_stats.get("sharesOutstanding")), + "float_shares": extract_raw_value(key_stats.get("floatShares")), + "shares_short": extract_raw_value(key_stats.get("sharesShort")), + "short_ratio": extract_raw_value(key_stats.get("shortRatio")), + "short_percent_of_float": extract_raw_value(key_stats.get("shortPercentOfFloat")), + "held_percent_insiders": extract_raw_value(key_stats.get("heldPercentInsiders")), + "held_percent_institutions": extract_raw_value( + key_stats.get("heldPercentInstitutions") + ), + } + + analyst = { + "recommendation": { + "key": key_metrics.get("recommendation_key"), + "mean": key_metrics.get("recommendation_mean"), + "analyst_opinion_count": key_metrics.get("analyst_opinion_count"), + "target_price_high": key_metrics.get("target_price_high"), + "target_price_low": key_metrics.get("target_price_low"), + "target_price_mean": key_metrics.get("target_price_mean"), + "target_price_median": key_metrics.get("target_price_median"), + }, + "trend": simplify_recommendation_trend(recommendation_trend.get("trend")), + "upgrades_downgrades": simplify_upgrade_history( + upgrade_history.get("history"), limit=20 + ), + "ratings_top": simplify_ratings_top(ratings_payload), + } + + earnings_summary = summarize_earnings(earnings, calendar_events) + performance_summary = { + "equity_performance": summarize_performance(equity_performance), + "unadjusted_performance": summarize_performance(performance_overview), + } + matched_symbols = [] for candidate in [ price_data.get("symbol") if price_data else None, @@ -733,26 +956,16 @@ def scrape_yahoo_profile(symbol): "url": url, "fetched_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"), "validation": validation, - "company_profile": summary_profile, - "summary_detail": summary_detail, - "default_key_statistics": key_stats, - "financial_data": financial_data, - "price": price_data, - "earnings": earnings, - "earnings_gaap": earnings_gaap, - "earnings_non_gaap": earnings_non_gaap, - "calendar_events": calendar_events, - "equity_performance": equity_performance, - "performance_overview": performance_overview, - "recommendation_trend": recommendation_trend, - "upgrade_downgrade_history": upgrade_history, "key_metrics": key_metrics, - "quote": quote, - "quote_type": quote_type, - "recommendations_by_symbol": recs_payload, - "ratings_top": ratings_payload, - "news_summary": news_summary, - "recent_news": news, + "valuation": valuation, + "profitability": profitability, + "growth": growth, + "financial_strength": financial_strength, + "cashflow": cashflow, + "ownership": ownership, + "analyst": analyst, + "earnings": earnings_summary, + "performance": performance_summary, "data_sources": { "quote_summary": quote_summary_url, "quote": quote_url, diff --git a/scripts/test_profile_cycles.py b/scripts/test_profile_cycles.py index 7c048be..0101503 100644 --- a/scripts/test_profile_cycles.py +++ b/scripts/test_profile_cycles.py @@ -8,24 +8,18 @@ import urllib.request DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"] REQUIRED_SECTIONS = [ - "company_profile", - "summary_detail", - "default_key_statistics", - "financial_data", - "price", "key_metrics", - "recommendation_trend", - "upgrade_downgrade_history", + "valuation", + "profitability", + "growth", + "financial_strength", + "cashflow", + "ownership", + "analyst", "earnings", - "calendar_events", - "equity_performance", - "performance_overview", - "quote", - "quote_type", - "recent_news", + "performance", ] -REQUIRED_COMPANY_FIELDS = ["longBusinessSummary", "industry", "sector"] REQUIRED_KEY_METRICS = [ "previous_close", "open", @@ -53,15 +47,16 @@ def parse_list(value, default): def build_signature(data): return { - "company_profile_keys": sorted(data.get("company_profile", {}).keys()), - "summary_detail_keys": sorted(data.get("summary_detail", {}).keys()), - "default_key_statistics_keys": sorted( - data.get("default_key_statistics", {}).keys() - ), - "financial_data_keys": sorted(data.get("financial_data", {}).keys()), - "price_keys": sorted(data.get("price", {}).keys()), "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()), - "data_sources_keys": sorted(data.get("data_sources", {}).keys()), + "valuation_keys": sorted(data.get("valuation", {}).keys()), + "profitability_keys": sorted(data.get("profitability", {}).keys()), + "growth_keys": sorted(data.get("growth", {}).keys()), + "financial_strength_keys": sorted(data.get("financial_strength", {}).keys()), + "cashflow_keys": sorted(data.get("cashflow", {}).keys()), + "ownership_keys": sorted(data.get("ownership", {}).keys()), + "analyst_keys": sorted(data.get("analyst", {}).keys()), + "earnings_keys": sorted(data.get("earnings", {}).keys()), + "performance_keys": sorted(data.get("performance", {}).keys()), } @@ -80,19 +75,11 @@ def validate_payload(symbol, data): if section not in data: return f"Missing section {section} for {symbol}" - company_profile = data.get("company_profile", {}) - for field in REQUIRED_COMPANY_FIELDS: - if field not in company_profile: - return f"Missing company field {field} for {symbol}" - key_metrics = data.get("key_metrics", {}) for field in REQUIRED_KEY_METRICS: if field not in key_metrics: return f"Missing key metric {field} for {symbol}" - if not data.get("news_summary") and not data.get("recent_news"): - return f"Missing news summary and recent news for {symbol}" - return None