Compare commits


4 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Rushabh Gosar | 83a5e843c0 | Feature: Merge Truth Social scraper logic into SimpleScraper | 2026-01-09 18:30:39 -08:00 |
| Rushabh Gosar | 4e02c6ce0a | Fix: Remove default MSFT symbol to prevent silent data errors | 2026-01-09 17:32:10 -08:00 |
| Rushabh Gosar | c01a98abce | Prune profile payload for options thesis | 2025-12-29 12:27:30 -08:00 |
| Rushabh Gosar | 68805ed80a | Add profile endpoint and validation | 2025-12-29 00:45:13 -08:00 |
3 changed files with 964 additions and 1 deletion

View File

@@ -16,6 +16,13 @@
- If `strikeLimit` is greater than available strikes, all available rows are returned.
- `pruned_calls_count` and `pruned_puts_count` report how many rows were removed beyond the limit.
- `selected_expiration` reports the resolved expiry (epoch + label), and mismatches return an error.
- Route: `GET /profile`
- Query params:
  - `stock`: symbol (required; requests without it return HTTP 400).
- Behavior:
- Loads `https://finance.yahoo.com/quote/<SYMBOL>/` with Playwright.
- Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations).
- Returns a pruned JSON with valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries.
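
A minimal client sketch (endpoint and `stock` parameter as documented above; the printed keys exist in the pruned response):

```python
import json
import urllib.parse
import urllib.request

def fetch_profile(symbol, base_url="http://127.0.0.1:9777/profile", timeout=180):
    # Build e.g. http://127.0.0.1:9777/profile?stock=AAPL and decode the JSON body.
    url = f"{base_url}?{urllib.parse.urlencode({'stock': symbol})}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))

profile = fetch_profile("AAPL")
print(profile["validation"]["symbol_match"], profile["key_metrics"]["current_price"])
```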
## Guard Rails
- Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push.
@@ -28,9 +35,26 @@
- Local server:
  - Start: `.\venv\Scripts\python.exe scraper_service.py`
  - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Profile validation (local server):
  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
- Docker server:
  - Start: `docker run --rm -p 9777:9777 rushabhtechie/yahoo-scraper:latest`
  - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Profile validation (docker server):
  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
## Update Log (2025-12-28)
- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations).
- `/profile` response trimmed to focus on valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries.
- Validation ensures quote data matches the requested symbol, with issues reported in `validation`.
- Issue encountered: an existing server instance was bound to port 9777 without `/profile`; resolved by restarting the service with the updated script.
- Tests executed (local):
  - `.\venv\Scripts\python.exe scripts/test_profile_cycles.py --runs 8 --timeout 180`
  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Tests executed (docker):
  - `docker build -t rushabhtechie/yahoo-scraper:latest .`
  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8 --timeout 180`
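
For reference, the `validation` block in a successful `/profile` response has this shape (illustrative values; keys match the server code):

```python
validation = {
    "requested_symbol": "MSFT",
    "matched_symbols": ["MSFT", "MSFT", "MSFT"],  # from the price, quote, and quoteType payloads
    "symbol_match": True,
    "issues": [],  # e.g. ["missing_quote_data"] when a payload is absent
}
```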
- The test harness verifies:
  - Requested expiration matches `selected_expiration.value`.
  - Contract symbols include the expected YYMMDD code.
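
The YYMMDD check is, in essence (a minimal sketch assuming epoch expirations interpreted as UTC; names are illustrative):

```python
from datetime import datetime, timezone

def expected_expiry_code(expiration_epoch: int) -> str:
    # OCC-style contract symbols embed the expiry as YYMMDD,
    # e.g. a 2026-01-16 expiry yields "260116".
    return datetime.fromtimestamp(expiration_epoch, tz=timezone.utc).strftime("%y%m%d")

def contract_matches_expiry(contract_symbol: str, expiration_epoch: int) -> bool:
    return expected_expiry_code(expiration_epoch) in contract_symbol
```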

View File

@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from datetime import datetime, timezone
import html
import urllib.parse
import logging
import json
@@ -105,6 +106,14 @@ def extract_raw_value(value):
    return value
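
# Yahoo serializes most numeric fields as {"raw": <number>, "fmt": "<display string>"}
# (shape illustrative); extract_value below prefers the machine-readable "raw" and
# falls back to the formatted "fmt".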
def extract_value(value):
    if isinstance(value, dict):
        if value.get("raw") is not None:
            return value.get("raw")
        return value.get("fmt")
    return value

def extract_fmt_value(value):
    if isinstance(value, dict):
        return value.get("fmt")
@@ -353,6 +362,620 @@ def parse_strike_limit(value, default=25):
    return limit if limit > 0 else default
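
# Each SvelteKit-fetched payload is embedded in the page as (attributes abridged):
#   <script type="application/json" data-sveltekit-fetched data-url="...">
#     {"status": 200, "statusText": "OK", "body": "{...escaped JSON...}"}
#   </script>
# parse_sveltekit_payload unwraps this envelope, decoding "body" a second time
# when it arrives as a JSON string.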
def parse_sveltekit_payload(raw_text):
    if not raw_text:
        return None
    try:
        outer = json.loads(raw_text)
    except json.JSONDecodeError:
        return None
    body = outer.get("body")
    if isinstance(body, str):
        try:
            body = json.loads(body)
        except json.JSONDecodeError:
            pass
    return {
        "status": outer.get("status"),
        "statusText": outer.get("statusText"),
        "body": body,
    }

def extract_sveltekit_payloads_from_soup(soup):
    payloads = {}
    if soup is None:
        return payloads
    scripts = soup.select('script[type="application/json"][data-sveltekit-fetched]')
    for script in scripts:
        url = script.get("data-url")
        if not url:
            continue
        url = html.unescape(url)
        raw_text = script.string or script.get_text()
        payload = parse_sveltekit_payload(raw_text)
        if not payload:
            continue
        payloads[url] = payload
    return payloads

def select_payload(payloads, needle, symbol=None):
    if not payloads:
        return None, None
    needle = needle.lower()
    symbol_token = symbol.lower() if symbol else None
    fallback = None
    for url, payload in payloads.items():
        url_lower = url.lower()
        if needle not in url_lower:
            continue
        if symbol_token:
            if f"/{symbol_token}" in url_lower or f"symbols={symbol_token}" in url_lower:
                return url, payload.get("body")
        if fallback is None:
            fallback = (url, payload.get("body"))
    return fallback if fallback else (None, None)

def extract_quote_summary(payload):
    if not payload:
        return None
    summary = payload.get("quoteSummary")
    if not summary:
        return None
    result = summary.get("result") or []
    return result[0] if result else None

def extract_quote_response(payload):
    if not payload:
        return None
    response = payload.get("quoteResponse")
    if not response:
        return None
    result = response.get("result") or []
    return result[0] if result else None

def extract_quote_type(payload):
    if not payload:
        return None
    quote_type = payload.get("quoteType")
    if not quote_type:
        return None
    result = quote_type.get("result") or []
    return result[0] if result else None

def extract_recent_news_from_soup(soup, limit=20):
    items = []
    if soup is None:
        return items
    container = soup.select_one('[data-testid="recent-news"]')
    root = container if container else soup
    seen = set()
    for item in root.select('[data-testid="storyitem"]'):
        title_el = item.select_one("h3")
        link_el = item.select_one("a[href]")
        if not title_el and not link_el:
            continue
        title = title_el.get_text(strip=True) if title_el else None
        link = link_el.get("href") if link_el else None
        publisher = None
        published = None
        publishing = item.select_one(".publishing")
        if publishing:
            text = " ".join(publishing.stripped_strings)
            if "\u2022" in text:
                parts = [part.strip() for part in text.split("\u2022", 1)]
                publisher = parts[0] or None
                published = parts[1] if len(parts) > 1 else None
            else:
                publisher = text or None
        key = link or title
        if key and key in seen:
            continue
        if key:
            seen.add(key)
        items.append(
            {
                "title": title,
                "publisher": publisher,
                "published": published,
                "link": link,
            }
        )
        if limit and len(items) >= limit:
            break
    return items

def extract_news_summary_from_soup(soup):
    if soup is None:
        return None
    summary = soup.select_one('[data-testid="ticker-news-summary"]')
    if not summary:
        return None
    text = " ".join(summary.stripped_strings)
    return text if text else None

def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_data, quote):
    summary_detail = summary_detail or {}
    key_stats = key_stats or {}
    financial_data = financial_data or {}
    price_data = price_data or {}
    quote = quote or {}

    def pick_value(*values):
        for value in values:
            if value is not None:
                return value
        return None

    return {
        "previous_close": extract_raw_value(summary_detail.get("previousClose")),
        "open": extract_raw_value(summary_detail.get("open")),
        "bid": extract_raw_value(summary_detail.get("bid")),
        "ask": extract_raw_value(summary_detail.get("ask")),
        "bid_size": extract_raw_value(summary_detail.get("bidSize")),
        "ask_size": extract_raw_value(summary_detail.get("askSize")),
        "day_low": extract_raw_value(summary_detail.get("dayLow")),
        "day_high": extract_raw_value(summary_detail.get("dayHigh")),
        "fifty_two_week_low": extract_raw_value(quote.get("fiftyTwoWeekLow")),
        "fifty_two_week_high": extract_raw_value(quote.get("fiftyTwoWeekHigh")),
        "volume": pick_value(
            extract_raw_value(summary_detail.get("volume")),
            extract_raw_value(price_data.get("regularMarketVolume")),
            extract_raw_value(quote.get("regularMarketVolume")),
        ),
        "average_volume": pick_value(
            extract_raw_value(summary_detail.get("averageVolume")),
            extract_raw_value(price_data.get("averageDailyVolume3Month")),
        ),
        "market_cap": pick_value(
            extract_raw_value(summary_detail.get("marketCap")),
            extract_raw_value(quote.get("marketCap")),
        ),
        "beta": pick_value(
            extract_raw_value(summary_detail.get("beta")),
            extract_raw_value(key_stats.get("beta")),
        ),
        "trailing_pe": pick_value(
            extract_raw_value(summary_detail.get("trailingPE")),
            extract_raw_value(key_stats.get("trailingPE")),
        ),
        "forward_pe": pick_value(
            extract_raw_value(summary_detail.get("forwardPE")),
            extract_raw_value(key_stats.get("forwardPE")),
        ),
        "eps_trailing": extract_raw_value(key_stats.get("trailingEps")),
        "eps_forward": extract_raw_value(key_stats.get("forwardEps")),
        "dividend_rate": extract_raw_value(summary_detail.get("dividendRate")),
        "dividend_yield": extract_raw_value(summary_detail.get("dividendYield")),
        "ex_dividend_date": extract_raw_value(summary_detail.get("exDividendDate")),
        "payout_ratio": extract_raw_value(summary_detail.get("payoutRatio")),
        "implied_volatility": extract_raw_value(summary_detail.get("impliedVolatility")),
        "current_price": pick_value(
            extract_raw_value(price_data.get("regularMarketPrice")),
            extract_raw_value(financial_data.get("currentPrice")),
            extract_raw_value(quote.get("regularMarketPrice")),
        ),
        "recommendation_key": financial_data.get("recommendationKey"),
        "recommendation_mean": extract_raw_value(financial_data.get("recommendationMean")),
        "target_price_high": extract_raw_value(financial_data.get("targetHighPrice")),
        "target_price_low": extract_raw_value(financial_data.get("targetLowPrice")),
        "target_price_mean": extract_raw_value(financial_data.get("targetMeanPrice")),
        "target_price_median": extract_raw_value(financial_data.get("targetMedianPrice")),
        "analyst_opinion_count": extract_raw_value(
            financial_data.get("numberOfAnalystOpinions")
        ),
    }

def simplify_recommendation_trend(trend):
    simplified = []
    for entry in trend or []:
        simplified.append(
            {
                "period": entry.get("period"),
                "strong_buy": entry.get("strongBuy"),
                "buy": entry.get("buy"),
                "hold": entry.get("hold"),
                "sell": entry.get("sell"),
                "strong_sell": entry.get("strongSell"),
            }
        )
    return simplified

def simplify_upgrade_history(history, limit=20):
    simplified = []
    for entry in history or []:
        simplified.append(
            {
                "firm": entry.get("firm"),
                "action": entry.get("action"),
                "from_grade": entry.get("fromGrade"),
                "to_grade": entry.get("toGrade"),
                "date": entry.get("epochGradeDate") or entry.get("gradeDate"),
            }
        )
        if limit and len(simplified) >= limit:
            break
    return simplified

def simplify_ratings_top(payload):
    if not payload:
        return None
    simplified = {}
    for key, value in payload.items():
        if not isinstance(value, dict):
            continue
        simplified[key] = {
            "analyst": value.get("analyst"),
            "rating_current": value.get("rating_current"),
            "rating_sentiment": value.get("rating_sentiment"),
            "pt_current": value.get("pt_current"),
            "adjusted_pt_current": value.get("adjusted_pt_current"),
            "announcement_date": value.get("announcement_date"),
            "datapoints": value.get("datapoints"),
            "scores": {
                "dir": extract_value(value.get("dir")),
                "mm": extract_value(value.get("mm")),
                "pt": extract_value(value.get("pt")),
                "fin_score": extract_value(value.get("fin_score")),
            },
        }
    return simplified or None

def summarize_performance(perf_data):
    if not perf_data:
        return {}
    overview = perf_data.get("performanceOverview")
    if isinstance(overview, dict):
        return {
            "as_of_date": extract_value(overview.get("asOfDate")),
            "returns": {
                "five_day": extract_value(overview.get("fiveDaysReturn")),
                "one_month": extract_value(overview.get("oneMonthReturn")),
                "three_month": extract_value(overview.get("threeMonthReturn")),
                "six_month": extract_value(overview.get("sixMonthReturn")),
                "ytd": extract_value(overview.get("ytdReturnPct")),
                "one_year": extract_value(overview.get("oneYearTotalReturn")),
                "two_year": extract_value(overview.get("twoYearTotalReturn")),
                "three_year": extract_value(overview.get("threeYearTotalReturn")),
                "five_year": extract_value(overview.get("fiveYearTotalReturn")),
                "ten_year": extract_value(overview.get("tenYearTotalReturn")),
                "max": extract_value(overview.get("maxReturn")),
            },
        }
    summary = []
    for entry in overview or []:
        if not isinstance(entry, dict):
            continue
        summary.append(
            {
                "period": entry.get("period"),
                "performance": extract_value(entry.get("performance")),
                "benchmark": extract_value(entry.get("benchmark")),
            }
        )
    return {"periods": summary} if summary else {}

def summarize_earnings(earnings, calendar_events):
    earnings = earnings or {}
    calendar_events = calendar_events or {}
    earnings_chart = earnings.get("earningsChart", {}) or {}
    financials_chart = earnings.get("financialsChart", {}) or {}
    calendar_earnings = calendar_events.get("earnings", {}) or {}
    quarterly = []
    for entry in earnings_chart.get("quarterly") or []:
        quarterly.append(
            {
                "quarter": entry.get("date"),
                "actual": extract_value(entry.get("actual")),
                "estimate": extract_value(entry.get("estimate")),
                "surprise": extract_value(entry.get("difference")),
                "surprise_percent": extract_value(entry.get("surprisePct")),
            }
        )
    yearly = []
    for entry in financials_chart.get("yearly") or []:
        yearly.append(
            {
                "year": entry.get("date"),
                "revenue": extract_value(entry.get("revenue")),
                "earnings": extract_value(entry.get("earnings")),
            }
        )
    quarterly_financials = []
    for entry in financials_chart.get("quarterly") or []:
        quarterly_financials.append(
            {
                "quarter": entry.get("date"),
                "revenue": extract_value(entry.get("revenue")),
                "earnings": extract_value(entry.get("earnings")),
            }
        )
    return {
        "next_earnings_dates": [
            extract_value(value) for value in calendar_earnings.get("earningsDate", []) or []
        ],
        "is_earnings_date_estimate": calendar_earnings.get("isEarningsDateEstimate"),
        "earnings_estimates": {
            "average": extract_value(calendar_earnings.get("earningsAverage")),
            "low": extract_value(calendar_earnings.get("earningsLow")),
            "high": extract_value(calendar_earnings.get("earningsHigh")),
        },
        "revenue_estimates": {
            "average": extract_value(calendar_earnings.get("revenueAverage")),
            "low": extract_value(calendar_earnings.get("revenueLow")),
            "high": extract_value(calendar_earnings.get("revenueHigh")),
        },
        "quarterly_earnings": quarterly[:4],
        "yearly_financials": yearly[:4],
        "quarterly_financials": quarterly_financials[:4],
        "current_quarter_estimate": extract_value(
            earnings_chart.get("currentQuarterEstimate")
        ),
        "current_quarter_estimate_date": earnings_chart.get("currentQuarterEstimateDate"),
        "current_calendar_quarter": earnings_chart.get("currentCalendarQuarter"),
        "current_fiscal_quarter": earnings_chart.get("currentFiscalQuarter"),
    }

def scrape_yahoo_profile(symbol):
    encoded = urllib.parse.quote(symbol, safe="")
    url = f"https://finance.yahoo.com/quote/{encoded}/"
    app.logger.info("Starting profile scrape for symbol=%s url=%s", symbol, url)
    response_html = None
    rendered_html = None
    payloads = {}
    with sync_playwright() as p:
        launch_args = chromium_launch_args()
        if launch_args:
            app.logger.info("GPU acceleration enabled")
        else:
            app.logger.info("GPU acceleration disabled")
        browser = p.chromium.launch(headless=True, args=launch_args)
        page = browser.new_page()
        page.set_extra_http_headers(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
                )
            }
        )
        page.set_default_timeout(60000)
        try:
            response = page.goto(url, wait_until="domcontentloaded", timeout=60000)
            app.logger.info("Profile page loaded (domcontentloaded) for %s", symbol)
            if response:
                response_html = response.text()
            else:
                app.logger.warning("No response body for profile page %s", symbol)
            page.wait_for_timeout(1000)
            rendered_html = page.content()
        finally:
            browser.close()
    if not response_html and not rendered_html:
        return {"error": "Profile page content missing", "stock": symbol, "url": url}
    payload_source = response_html or rendered_html
    payload_soup = BeautifulSoup(payload_source, "html.parser") if payload_source else None
    payloads = extract_sveltekit_payloads_from_soup(payload_soup)
    if not payloads and rendered_html and rendered_html != payload_source:
        fallback_soup = BeautifulSoup(rendered_html, "html.parser")
        payloads = extract_sveltekit_payloads_from_soup(fallback_soup)
    if not payloads:
        return {
            "error": "No embedded payloads found on profile page",
            "stock": symbol,
            "url": url,
        }
    quote_summary_url, quote_summary_payload = select_payload(
        payloads, "quoteSummary", symbol
    )
    quote_url, quote_payload = select_payload(payloads, "v7/finance/quote?", symbol)
    quote_type_url, quote_type_payload = select_payload(
        payloads, "/v1/finance/quoteType/", symbol
    )
    ratings_url, ratings_payload = select_payload(payloads, "ratings/top", symbol)
    recs_url, recs_payload = select_payload(
        payloads, "recommendationsbysymbol", symbol
    )
    quote_summary = extract_quote_summary(quote_summary_payload)
    quote = extract_quote_response(quote_payload)
    quote_type = extract_quote_type(quote_type_payload)
    summary_detail = quote_summary.get("summaryDetail", {}) if quote_summary else {}
    key_stats = quote_summary.get("defaultKeyStatistics", {}) if quote_summary else {}
    financial_data = quote_summary.get("financialData", {}) if quote_summary else {}
    price_data = quote_summary.get("price", {}) if quote_summary else {}
    recommendation_trend = (
        quote_summary.get("recommendationTrend", {}) if quote_summary else {}
    )
    upgrade_history = (
        quote_summary.get("upgradeDowngradeHistory", {}) if quote_summary else {}
    )
    earnings = quote_summary.get("earnings", {}) if quote_summary else {}
    calendar_events = quote_summary.get("calendarEvents", {}) if quote_summary else {}
    equity_performance = (
        quote_summary.get("equityPerformance", {}) if quote_summary else {}
    )
    performance_overview = (
        quote_summary.get("quoteUnadjustedPerformanceOverview", {})
        if quote_summary
        else {}
    )
    key_metrics = build_profile_key_metrics(
        summary_detail, key_stats, financial_data, price_data, quote
    )
    valuation = {
        "market_cap": extract_raw_value(key_stats.get("marketCap")),
        "enterprise_value": extract_raw_value(key_stats.get("enterpriseValue")),
        "price_to_book": extract_raw_value(key_stats.get("priceToBook")),
        "price_to_sales": extract_raw_value(key_stats.get("priceToSalesTrailing12Months")),
        "trailing_pe": key_metrics.get("trailing_pe"),
        "forward_pe": key_metrics.get("forward_pe"),
    }
    profitability = {
        "profit_margins": extract_raw_value(financial_data.get("profitMargins")),
        "operating_margins": extract_raw_value(financial_data.get("operatingMargins")),
        "gross_margins": extract_raw_value(financial_data.get("grossMargins")),
        "ebitda_margins": extract_raw_value(financial_data.get("ebitdaMargins")),
        "return_on_assets": extract_raw_value(financial_data.get("returnOnAssets")),
        "return_on_equity": extract_raw_value(financial_data.get("returnOnEquity")),
    }
    growth = {
        "revenue_growth": extract_raw_value(financial_data.get("revenueGrowth")),
        "earnings_growth": extract_raw_value(financial_data.get("earningsGrowth")),
        "revenue_per_share": extract_raw_value(financial_data.get("revenuePerShare")),
    }
    financial_strength = {
        "total_cash": extract_raw_value(financial_data.get("totalCash")),
        "total_debt": extract_raw_value(financial_data.get("totalDebt")),
        "debt_to_equity": extract_raw_value(financial_data.get("debtToEquity")),
        "current_ratio": extract_raw_value(financial_data.get("currentRatio")),
        "quick_ratio": extract_raw_value(financial_data.get("quickRatio")),
    }
    cashflow = {
        "operating_cashflow": extract_raw_value(financial_data.get("operatingCashflow")),
        "free_cashflow": extract_raw_value(financial_data.get("freeCashflow")),
        "ebitda": extract_raw_value(financial_data.get("ebitda")),
    }
    ownership = {
        "shares_outstanding": extract_raw_value(key_stats.get("sharesOutstanding")),
        "float_shares": extract_raw_value(key_stats.get("floatShares")),
        "shares_short": extract_raw_value(key_stats.get("sharesShort")),
        "short_ratio": extract_raw_value(key_stats.get("shortRatio")),
        "short_percent_of_float": extract_raw_value(key_stats.get("shortPercentOfFloat")),
        "held_percent_insiders": extract_raw_value(key_stats.get("heldPercentInsiders")),
        "held_percent_institutions": extract_raw_value(
            key_stats.get("heldPercentInstitutions")
        ),
    }
    analyst = {
        "recommendation": {
            "key": key_metrics.get("recommendation_key"),
            "mean": key_metrics.get("recommendation_mean"),
            "analyst_opinion_count": key_metrics.get("analyst_opinion_count"),
            "target_price_high": key_metrics.get("target_price_high"),
            "target_price_low": key_metrics.get("target_price_low"),
            "target_price_mean": key_metrics.get("target_price_mean"),
            "target_price_median": key_metrics.get("target_price_median"),
        },
        "trend": simplify_recommendation_trend(recommendation_trend.get("trend")),
        "upgrades_downgrades": simplify_upgrade_history(
            upgrade_history.get("history"), limit=20
        ),
        "ratings_top": simplify_ratings_top(ratings_payload),
    }
    earnings_summary = summarize_earnings(earnings, calendar_events)
    performance_summary = {
        "equity_performance": summarize_performance(equity_performance),
        "unadjusted_performance": summarize_performance(performance_overview),
    }
    matched_symbols = []
    for candidate in [
        price_data.get("symbol") if price_data else None,
        quote.get("symbol") if quote else None,
        quote_type.get("symbol") if quote_type else None,
    ]:
        if candidate:
            matched_symbols.append(candidate)
    symbol_match = None
    if matched_symbols:
        symbol_match = any(
            candidate.upper() == symbol.upper() for candidate in matched_symbols
        )
    issues = []
    if not quote_summary:
        issues.append("missing_quote_summary")
    if matched_symbols and not symbol_match:
        issues.append("symbol_mismatch")
    if not quote:
        issues.append("missing_quote_data")
    if not quote_type:
        issues.append("missing_quote_type")
    validation = {
        "requested_symbol": symbol,
        "matched_symbols": matched_symbols,
        "symbol_match": symbol_match,
        "issues": issues,
    }
    if "missing_quote_summary" in issues or "symbol_mismatch" in issues:
        return {
            "error": "Profile validation failed",
            "stock": symbol,
            "url": url,
            "validation": validation,
            "data_sources": {
                "quote_summary": quote_summary_url,
                "quote": quote_url,
                "quote_type": quote_type_url,
                "ratings_top": ratings_url,
                "recommendations": recs_url,
            },
        }
    return {
        "stock": symbol,
        "url": url,
        "fetched_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        "validation": validation,
        "key_metrics": key_metrics,
        "valuation": valuation,
        "profitability": profitability,
        "growth": growth,
        "financial_strength": financial_strength,
        "cashflow": cashflow,
        "ownership": ownership,
        "analyst": analyst,
        "earnings": earnings_summary,
        "performance": performance_summary,
        "data_sources": {
            "quote_summary": quote_summary_url,
            "quote": quote_url,
            "quote_type": quote_type_url,
            "ratings_top": ratings_url,
            "recommendations": recs_url,
        },
    }
def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
    def parse_table(table_html, side):
        if not table_html:
@@ -656,7 +1279,10 @@ def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
@app.route("/scrape_sync") @app.route("/scrape_sync")
def scrape_sync(): def scrape_sync():
symbol = request.args.get("stock", "MSFT") symbol = request.args.get("stock")
if not symbol:
return jsonify({"error": "Missing 'stock' parameter"}), 400
expiration = ( expiration = (
request.args.get("expiration") request.args.get("expiration")
or request.args.get("expiry") or request.args.get("expiry")
@@ -672,5 +1298,173 @@ def scrape_sync():
    return jsonify(scrape_yahoo_options(symbol, expiration, strike_limit))
@app.route("/profile")
def profile():
symbol = request.args.get("stock")
if not symbol:
return jsonify({"error": "Missing 'stock' parameter"}), 400
app.logger.info("Received /profile request for symbol=%s", symbol)
return jsonify(scrape_yahoo_profile(symbol))
def scrape_truths_sync(count=10, handle="realDonaldTrump"):
    app.logger.info("Starting Truth Social scrape for handle=%s count=%d", handle, count)
    with sync_playwright() as p:
        launch_args = chromium_launch_args()
        if launch_args:
            app.logger.info("GPU acceleration enabled for Truth Social")
        browser = p.chromium.launch(headless=True, args=launch_args)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 2000}
        )
        page = context.new_page()
        try:
            url = f"https://truthsocial.com/@{handle}"
            app.logger.info("Navigating to %s", url)
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Wait for content to load
            page.wait_for_timeout(5000)
            # Scale down to fit more content
            page.evaluate("document.body.style.zoom = '0.7'")
            page.wait_for_timeout(2000)
            # Handle potential modal/ad overlay
            try:
                close_btn = page.query_selector('[data-testid="close-modal"]')
                if close_btn:
                    close_btn.click()
                    page.wait_for_timeout(1000)
                page.keyboard.press("Escape")
            except Exception:
                pass
            # Wait for any status to appear
            selector = '[data-testid="status"]'
            try:
                page.wait_for_selector(selector, timeout=20000)
            except Exception:
                selector = '[data-id]'
                page.wait_for_selector(selector, timeout=10000)
            truths_data = []
            seen_ids = set()
            # Since virtual lists only render what is near the scroll position,
            # we need a few small scrolls even with a tall viewport.
            for scroll_step in range(10):
                # Get all current statuses
                statuses = page.query_selector_all(selector)
                for status in statuses:
                    if len(truths_data) >= count:
                        break
                    try:
                        # Find post ID and validate it belongs to the target handle
                        links = status.query_selector_all('a')
                        post_id = None
                        for link in links:
                            href = link.get_attribute('href')
                            if href and f"/@{handle}/posts/" in href:
                                post_id = href
                                break
                        if not post_id or post_id in seen_ids:
                            continue
                        seen_ids.add(post_id)
                        # Content
                        content_el = status.query_selector('[data-testid="status-content"]')
                        if not content_el:
                            content_el = status.query_selector('[data-testid="markup"]')
                        content_text = content_el.inner_text() if content_el else ""
                        # Time
                        time_el = status.query_selector('time')
                        time_text = time_el.get_attribute('title') if time_el else ""
                        if not time_text and time_el:
                            time_text = time_el.inner_text()
                        # Counts
                        def get_btn_text(btn_selector):
                            btn = status.query_selector(btn_selector)
                            return btn.inner_text() if btn else "0"
                        reply_count = get_btn_text('button[aria-label="Reply"]')
                        retruth_count = get_btn_text('button[aria-label="ReTruth"]')
                        like_count = get_btn_text('button[aria-label="Like"]')
                        # Media
                        media_urls = []
                        imgs = status.query_selector_all('img')
                        for img in imgs:
                            alt = img.get_attribute('alt')
                            if alt in ["Avatar", "Profile header", "Logo", "Verified Account"]:
                                continue
                            src = img.get_attribute('src')
                            if src and ("static-assets" in src or "proxy" in src):
                                media_urls.append(src)
                        videos = status.query_selector_all('video')
                        for video in videos:
                            src = video.get_attribute('src')
                            if not src:
                                source_tag = video.query_selector('source')
                                if source_tag:
                                    src = source_tag.get_attribute('src')
                            if src:
                                media_urls.append(src)
                        def clean(c):
                            return str(c).strip().replace('\n', '')
                        truths_data.append({
                            "id": post_id,
                            "content": content_text,
                            "time": time_text,
                            "likes_count": clean(like_count),
                            "comments_count": clean(reply_count),
                            "retruths_count": clean(retruth_count),
                            "media": list(set(media_urls))
                        })
                    except Exception:
                        continue
                if len(truths_data) >= count:
                    break
                # Scroll a bit to trigger next items
                page.evaluate("window.scrollBy(0, 500)")
                page.wait_for_timeout(1000)
            app.logger.info("Scraped %d truths", len(truths_data))
            return truths_data[:count]
        except Exception as e:
            app.logger.error("Truths scraper error: %s", e)
            return {"error": str(e)}
        finally:
            browser.close()

@app.route("/truths")
def truths():
    try:
        count = int(request.args.get("count", 10))
    except ValueError:
        count = 10
    handle = request.args.get("handle", "realDonaldTrump")
    app.logger.info("Received /truths request for handle=%s count=%d", handle, count)
    return jsonify(scrape_truths_sync(count, handle))
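
# Example request (illustrative): GET /truths?handle=realDonaldTrump&count=5
# returns a JSON list of posts, each with id, content, time, like/comment/retruth
# counts, and any media URLs found in the post.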
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=9777)

View File

@@ -0,0 +1,145 @@
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]

REQUIRED_SECTIONS = [
    "key_metrics",
    "valuation",
    "profitability",
    "growth",
    "financial_strength",
    "cashflow",
    "ownership",
    "analyst",
    "earnings",
    "performance",
]

REQUIRED_KEY_METRICS = [
    "previous_close",
    "open",
    "bid",
    "ask",
    "beta",
    "eps_trailing",
    "dividend_rate",
    "current_price",
]

def http_get(base_url, params, timeout):
    query = urllib.parse.urlencode(params)
    url = f"{base_url}?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))

def parse_list(value, default):
    if not value:
        return default
    return [item.strip() for item in value.split(",") if item.strip()]

def build_signature(data):
    return {
        "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
        "valuation_keys": sorted(data.get("valuation", {}).keys()),
        "profitability_keys": sorted(data.get("profitability", {}).keys()),
        "growth_keys": sorted(data.get("growth", {}).keys()),
        "financial_strength_keys": sorted(data.get("financial_strength", {}).keys()),
        "cashflow_keys": sorted(data.get("cashflow", {}).keys()),
        "ownership_keys": sorted(data.get("ownership", {}).keys()),
        "analyst_keys": sorted(data.get("analyst", {}).keys()),
        "earnings_keys": sorted(data.get("earnings", {}).keys()),
        "performance_keys": sorted(data.get("performance", {}).keys()),
    }

def validate_payload(symbol, data):
    if "error" in data:
        return f"API error for {symbol}: {data}"
    if data.get("stock", "").upper() != symbol.upper():
        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
    validation = data.get("validation", {})
    if validation.get("symbol_match") is not True:
        return f"Validation symbol_match failed for {symbol}: {validation}"
    if validation.get("issues"):
        return f"Validation issues for {symbol}: {validation}"
    for section in REQUIRED_SECTIONS:
        if section not in data:
            return f"Missing section {section} for {symbol}"
    key_metrics = data.get("key_metrics", {})
    for field in REQUIRED_KEY_METRICS:
        if field not in key_metrics:
            return f"Missing key metric {field} for {symbol}"
    return None

def main():
    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:9777/profile",
        help="Base URL for /profile",
    )
    parser.add_argument(
        "--symbols",
        default=",".join(DEFAULT_SYMBOLS),
        help="Comma-separated stock symbols",
    )
    parser.add_argument(
        "--runs",
        type=int,
        default=8,
        help="Number of validation runs per symbol",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Request timeout in seconds",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Sleep between requests",
    )
    args = parser.parse_args()
    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
    signatures = {}
    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
    for run in range(1, args.runs + 1):
        print(f"Cycle {run}/{args.runs}")
        for symbol in symbols:
            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
            error = validate_payload(symbol, data)
            if error:
                print(f"ERROR: {error}")
                sys.exit(1)
            signature = build_signature(data)
            if symbol not in signatures:
                signatures[symbol] = signature
            elif signatures[symbol] != signature:
                print(f"ERROR: Signature changed for {symbol}")
                print(f"Baseline: {signatures[symbol]}")
                print(f"Current: {signature}")
                sys.exit(1)
            time.sleep(args.sleep)
        print(f"Cycle {run} OK")
    print("\nAll profile cycles completed successfully.")

if __name__ == "__main__":
    main()