Compare commits

...

4 Commits

Author        | SHA1       | Message                                                        | Date
Rushabh Gosar | 83a5e843c0 | Feature: Merge Truth Social scraper logic into SimpleScraper  | 2026-01-09 18:30:39 -08:00
Rushabh Gosar | 4e02c6ce0a | Fix: Remove default MSFT symbol to prevent silent data errors | 2026-01-09 17:32:10 -08:00
Rushabh Gosar | c01a98abce | Prune profile payload for options thesis                      | 2025-12-29 12:27:30 -08:00
Rushabh Gosar | 68805ed80a | Add profile endpoint and validation                           | 2025-12-29 00:45:13 -08:00
3 changed files with 964 additions and 1 deletion

View File

@@ -16,6 +16,13 @@
- If `strikeLimit` is greater than available strikes, all available rows are returned.
- `pruned_calls_count` and `pruned_puts_count` report how many rows were removed beyond the limit.
- `selected_expiration` reports the resolved expiry (epoch + label), and mismatches return an error.
- Route: `GET /profile`
- Query params:
- `stock`: symbol (required; requests without it return HTTP 400).
- Behavior:
- Loads `https://finance.yahoo.com/quote/<SYMBOL>/` with Playwright.
- Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations).
- Returns a pruned JSON with valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries (a sample call is sketched below).
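For reference, a minimal client call — a sketch using the same stdlib pattern as `scripts/test_profile_cycles.py`; the URL assumes the local server described under Guard Rails:

```python
import json
import urllib.request

# Hypothetical local call; start the server first (see Guard Rails).
url = "http://127.0.0.1:9777/profile?stock=MSFT"
with urllib.request.urlopen(url, timeout=180) as resp:
    data = json.loads(resp.read().decode("utf-8"))

print(data["validation"])  # {'requested_symbol': 'MSFT', 'symbol_match': True, 'issues': [], ...}
print(sorted(data))        # ['analyst', 'cashflow', 'data_sources', 'earnings', ...]
```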
## Guard Rails
- Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push.
@@ -28,9 +35,26 @@
- Local server:
- Start: `.\venv\Scripts\python.exe scraper_service.py`
- Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Profile validation (local server):
- Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
- Docker server:
- Start: `docker run --rm -p 9777:9777 rushabhtechie/yahoo-scraper:latest`
- Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Profile validation (docker server):
- Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
## Update Log (2025-12-28)
- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations).
- `/profile` response trimmed to focus on valuation, profitability, growth, financial strength, cashflow, ownership, analyst, earnings, and performance summaries.
- Validation ensures quote data matches the requested symbol, with issues reported in `validation`.
- Issue encountered: a stale server instance was still bound to port 9777 without `/profile`; restarting the service with the updated script resolved it.
- Tests executed (local):
- `.\venv\Scripts\python.exe scripts/test_profile_cycles.py --runs 8 --timeout 180`
- `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- Tests executed (docker):
- `docker build -t rushabhtechie/yahoo-scraper:latest .`
- `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
- `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8 --timeout 180`
- The test harness verifies (see the sketch below):
- Requested expiration matches `selected_expiration.value`.
- Contract symbols include the expected YYMMDD code.
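A sketch of those two checks, assuming OCC-style contract symbols (e.g. `MSFT260116C00400000`, where `260116` is the YYMMDD code); the function and argument names here are illustrative, not the harness's actual API:

```python
from datetime import datetime, timezone

def verify_contract(payload: dict, requested_epoch: int, contract_symbol: str) -> None:
    # 1. The service resolved the expiry the caller requested.
    assert payload["selected_expiration"]["value"] == requested_epoch
    # 2. OCC symbols embed the expiry as YYMMDD right after the root ticker.
    code = datetime.fromtimestamp(requested_epoch, tz=timezone.utc).strftime("%y%m%d")
    assert code in contract_symbol
```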

View File: scraper_service.py

@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from datetime import datetime, timezone
import html
import urllib.parse
import logging
import json
@@ -105,6 +106,14 @@ def extract_raw_value(value):
return value
def extract_value(value):
if isinstance(value, dict):
if value.get("raw") is not None:
return value.get("raw")
return value.get("fmt")
return value
def extract_fmt_value(value):
if isinstance(value, dict):
return value.get("fmt")
@@ -353,6 +362,620 @@ def parse_strike_limit(value, default=25):
return limit if limit > 0 else default
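# SvelteKit embeds each server-fetched response as a JSON <script> tag whose
# "body" field is frequently a JSON-encoded string itself, hence the second
# json.loads pass below.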
def parse_sveltekit_payload(raw_text):
if not raw_text:
return None
try:
outer = json.loads(raw_text)
except json.JSONDecodeError:
return None
body = outer.get("body")
if isinstance(body, str):
try:
body = json.loads(body)
except json.JSONDecodeError:
pass
return {
"status": outer.get("status"),
"statusText": outer.get("statusText"),
"body": body,
}
def extract_sveltekit_payloads_from_soup(soup):
payloads = {}
if soup is None:
return payloads
scripts = soup.select('script[type="application/json"][data-sveltekit-fetched]')
for script in scripts:
url = script.get("data-url")
if not url:
continue
url = html.unescape(url)
raw_text = script.string or script.get_text()
payload = parse_sveltekit_payload(raw_text)
if not payload:
continue
payloads[url] = payload
return payloads
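# Choose the captured payload whose URL contains `needle`, preferring URLs that
# reference the requested symbol (path segment or symbols= query param) and
# falling back to the first generic match otherwise.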
def select_payload(payloads, needle, symbol=None):
if not payloads:
return None, None
needle = needle.lower()
symbol_token = symbol.lower() if symbol else None
fallback = None
for url, payload in payloads.items():
url_lower = url.lower()
if needle not in url_lower:
continue
if symbol_token:
if f"/{symbol_token}" in url_lower or f"symbols={symbol_token}" in url_lower:
return url, payload.get("body")
if fallback is None:
fallback = (url, payload.get("body"))
return fallback if fallback else (None, None)
def extract_quote_summary(payload):
if not payload:
return None
summary = payload.get("quoteSummary")
if not summary:
return None
result = summary.get("result") or []
return result[0] if result else None
def extract_quote_response(payload):
if not payload:
return None
response = payload.get("quoteResponse")
if not response:
return None
result = response.get("result") or []
return result[0] if result else None
def extract_quote_type(payload):
if not payload:
return None
quote_type = payload.get("quoteType")
if not quote_type:
return None
result = quote_type.get("result") or []
return result[0] if result else None
def extract_recent_news_from_soup(soup, limit=20):
items = []
if soup is None:
return items
container = soup.select_one('[data-testid="recent-news"]')
root = container if container else soup
seen = set()
for item in root.select('[data-testid="storyitem"]'):
title_el = item.select_one("h3")
link_el = item.select_one("a[href]")
if not title_el and not link_el:
continue
title = title_el.get_text(strip=True) if title_el else None
link = link_el.get("href") if link_el else None
publisher = None
published = None
publishing = item.select_one(".publishing")
if publishing:
text = " ".join(publishing.stripped_strings)
if "\u2022" in text:
parts = [part.strip() for part in text.split("\u2022", 1)]
publisher = parts[0] or None
published = parts[1] if len(parts) > 1 else None
else:
publisher = text or None
key = link or title
if key and key in seen:
continue
if key:
seen.add(key)
items.append(
{
"title": title,
"publisher": publisher,
"published": published,
"link": link,
}
)
if limit and len(items) >= limit:
break
return items
def extract_news_summary_from_soup(soup):
if soup is None:
return None
summary = soup.select_one('[data-testid="ticker-news-summary"]')
if not summary:
return None
text = " ".join(summary.stripped_strings)
return text if text else None
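# Build the flat key-metrics block. Each metric reads the most specific
# quoteSummary module first, then falls back to the price block or the v7
# quote so one missing module does not null out the whole field.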
def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_data, quote):
summary_detail = summary_detail or {}
key_stats = key_stats or {}
financial_data = financial_data or {}
price_data = price_data or {}
quote = quote or {}
def pick_value(*values):
for value in values:
if value is not None:
return value
return None
return {
"previous_close": extract_raw_value(summary_detail.get("previousClose")),
"open": extract_raw_value(summary_detail.get("open")),
"bid": extract_raw_value(summary_detail.get("bid")),
"ask": extract_raw_value(summary_detail.get("ask")),
"bid_size": extract_raw_value(summary_detail.get("bidSize")),
"ask_size": extract_raw_value(summary_detail.get("askSize")),
"day_low": extract_raw_value(summary_detail.get("dayLow")),
"day_high": extract_raw_value(summary_detail.get("dayHigh")),
"fifty_two_week_low": extract_raw_value(quote.get("fiftyTwoWeekLow")),
"fifty_two_week_high": extract_raw_value(quote.get("fiftyTwoWeekHigh")),
"volume": pick_value(
extract_raw_value(summary_detail.get("volume")),
extract_raw_value(price_data.get("regularMarketVolume")),
extract_raw_value(quote.get("regularMarketVolume")),
),
"average_volume": pick_value(
extract_raw_value(summary_detail.get("averageVolume")),
extract_raw_value(price_data.get("averageDailyVolume3Month")),
),
"market_cap": pick_value(
extract_raw_value(summary_detail.get("marketCap")),
extract_raw_value(quote.get("marketCap")),
),
"beta": pick_value(
extract_raw_value(summary_detail.get("beta")),
extract_raw_value(key_stats.get("beta")),
),
"trailing_pe": pick_value(
extract_raw_value(summary_detail.get("trailingPE")),
extract_raw_value(key_stats.get("trailingPE")),
),
"forward_pe": pick_value(
extract_raw_value(summary_detail.get("forwardPE")),
extract_raw_value(key_stats.get("forwardPE")),
),
"eps_trailing": extract_raw_value(key_stats.get("trailingEps")),
"eps_forward": extract_raw_value(key_stats.get("forwardEps")),
"dividend_rate": extract_raw_value(summary_detail.get("dividendRate")),
"dividend_yield": extract_raw_value(summary_detail.get("dividendYield")),
"ex_dividend_date": extract_raw_value(summary_detail.get("exDividendDate")),
"payout_ratio": extract_raw_value(summary_detail.get("payoutRatio")),
"implied_volatility": extract_raw_value(summary_detail.get("impliedVolatility")),
"current_price": pick_value(
extract_raw_value(price_data.get("regularMarketPrice")),
extract_raw_value(financial_data.get("currentPrice")),
extract_raw_value(quote.get("regularMarketPrice")),
),
"recommendation_key": financial_data.get("recommendationKey"),
"recommendation_mean": extract_raw_value(financial_data.get("recommendationMean")),
"target_price_high": extract_raw_value(financial_data.get("targetHighPrice")),
"target_price_low": extract_raw_value(financial_data.get("targetLowPrice")),
"target_price_mean": extract_raw_value(financial_data.get("targetMeanPrice")),
"target_price_median": extract_raw_value(financial_data.get("targetMedianPrice")),
"analyst_opinion_count": extract_raw_value(
financial_data.get("numberOfAnalystOpinions")
),
}
def simplify_recommendation_trend(trend):
simplified = []
for entry in trend or []:
simplified.append(
{
"period": entry.get("period"),
"strong_buy": entry.get("strongBuy"),
"buy": entry.get("buy"),
"hold": entry.get("hold"),
"sell": entry.get("sell"),
"strong_sell": entry.get("strongSell"),
}
)
return simplified
def simplify_upgrade_history(history, limit=20):
simplified = []
for entry in history or []:
simplified.append(
{
"firm": entry.get("firm"),
"action": entry.get("action"),
"from_grade": entry.get("fromGrade"),
"to_grade": entry.get("toGrade"),
"date": entry.get("epochGradeDate") or entry.get("gradeDate"),
}
)
if limit and len(simplified) >= limit:
break
return simplified
def simplify_ratings_top(payload):
if not payload:
return None
simplified = {}
for key, value in payload.items():
if not isinstance(value, dict):
continue
simplified[key] = {
"analyst": value.get("analyst"),
"rating_current": value.get("rating_current"),
"rating_sentiment": value.get("rating_sentiment"),
"pt_current": value.get("pt_current"),
"adjusted_pt_current": value.get("adjusted_pt_current"),
"announcement_date": value.get("announcement_date"),
"datapoints": value.get("datapoints"),
"scores": {
"dir": extract_value(value.get("dir")),
"mm": extract_value(value.get("mm")),
"pt": extract_value(value.get("pt")),
"fin_score": extract_value(value.get("fin_score")),
},
}
return simplified or None
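# Yahoo ships the performance overview either as a dict of returns or as a
# list of {period, performance, benchmark} rows; summarize both shapes.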
def summarize_performance(perf_data):
if not perf_data:
return {}
overview = perf_data.get("performanceOverview")
if isinstance(overview, dict):
return {
"as_of_date": extract_value(overview.get("asOfDate")),
"returns": {
"five_day": extract_value(overview.get("fiveDaysReturn")),
"one_month": extract_value(overview.get("oneMonthReturn")),
"three_month": extract_value(overview.get("threeMonthReturn")),
"six_month": extract_value(overview.get("sixMonthReturn")),
"ytd": extract_value(overview.get("ytdReturnPct")),
"one_year": extract_value(overview.get("oneYearTotalReturn")),
"two_year": extract_value(overview.get("twoYearTotalReturn")),
"three_year": extract_value(overview.get("threeYearTotalReturn")),
"five_year": extract_value(overview.get("fiveYearTotalReturn")),
"ten_year": extract_value(overview.get("tenYearTotalReturn")),
"max": extract_value(overview.get("maxReturn")),
},
}
summary = []
for entry in overview or []:
if not isinstance(entry, dict):
continue
summary.append(
{
"period": entry.get("period"),
"performance": extract_value(entry.get("performance")),
"benchmark": extract_value(entry.get("benchmark")),
}
)
return {"periods": summary} if summary else {}
def summarize_earnings(earnings, calendar_events):
earnings = earnings or {}
calendar_events = calendar_events or {}
earnings_chart = earnings.get("earningsChart", {}) or {}
financials_chart = earnings.get("financialsChart", {}) or {}
calendar_earnings = calendar_events.get("earnings", {}) or {}
quarterly = []
for entry in earnings_chart.get("quarterly") or []:
quarterly.append(
{
"quarter": entry.get("date"),
"actual": extract_value(entry.get("actual")),
"estimate": extract_value(entry.get("estimate")),
"surprise": extract_value(entry.get("difference")),
"surprise_percent": extract_value(entry.get("surprisePct")),
}
)
yearly = []
for entry in financials_chart.get("yearly") or []:
yearly.append(
{
"year": entry.get("date"),
"revenue": extract_value(entry.get("revenue")),
"earnings": extract_value(entry.get("earnings")),
}
)
quarterly_financials = []
for entry in financials_chart.get("quarterly") or []:
quarterly_financials.append(
{
"quarter": entry.get("date"),
"revenue": extract_value(entry.get("revenue")),
"earnings": extract_value(entry.get("earnings")),
}
)
return {
"next_earnings_dates": [
extract_value(value) for value in calendar_earnings.get("earningsDate", []) or []
],
"is_earnings_date_estimate": calendar_earnings.get("isEarningsDateEstimate"),
"earnings_estimates": {
"average": extract_value(calendar_earnings.get("earningsAverage")),
"low": extract_value(calendar_earnings.get("earningsLow")),
"high": extract_value(calendar_earnings.get("earningsHigh")),
},
"revenue_estimates": {
"average": extract_value(calendar_earnings.get("revenueAverage")),
"low": extract_value(calendar_earnings.get("revenueLow")),
"high": extract_value(calendar_earnings.get("revenueHigh")),
},
"quarterly_earnings": quarterly[:4],
"yearly_financials": yearly[:4],
"quarterly_financials": quarterly_financials[:4],
"current_quarter_estimate": extract_value(
earnings_chart.get("currentQuarterEstimate")
),
"current_quarter_estimate_date": earnings_chart.get("currentQuarterEstimateDate"),
"current_calendar_quarter": earnings_chart.get("currentCalendarQuarter"),
"current_fiscal_quarter": earnings_chart.get("currentFiscalQuarter"),
}
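# Profile pipeline: render the quote page headless, harvest the embedded
# SvelteKit payloads, pick the quoteSummary/quote/quoteType/ratings/
# recommendations bodies, validate the symbol, and prune everything into
# thesis-focused sections.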
def scrape_yahoo_profile(symbol):
encoded = urllib.parse.quote(symbol, safe="")
url = f"https://finance.yahoo.com/quote/{encoded}/"
app.logger.info("Starting profile scrape for symbol=%s url=%s", symbol, url)
response_html = None
rendered_html = None
payloads = {}
with sync_playwright() as p:
launch_args = chromium_launch_args()
if launch_args:
app.logger.info("GPU acceleration enabled")
else:
app.logger.info("GPU acceleration disabled")
browser = p.chromium.launch(headless=True, args=launch_args)
page = browser.new_page()
page.set_extra_http_headers(
{
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
)
}
)
page.set_default_timeout(60000)
try:
response = page.goto(url, wait_until="domcontentloaded", timeout=60000)
app.logger.info("Profile page loaded (domcontentloaded) for %s", symbol)
if response:
response_html = response.text()
else:
app.logger.warning("No response body for profile page %s", symbol)
page.wait_for_timeout(1000)
rendered_html = page.content()
finally:
browser.close()
if not response_html and not rendered_html:
return {"error": "Profile page content missing", "stock": symbol, "url": url}
payload_source = response_html or rendered_html
payload_soup = BeautifulSoup(payload_source, "html.parser") if payload_source else None
payloads = extract_sveltekit_payloads_from_soup(payload_soup)
if not payloads and rendered_html and rendered_html != payload_source:
fallback_soup = BeautifulSoup(rendered_html, "html.parser")
payloads = extract_sveltekit_payloads_from_soup(fallback_soup)
if not payloads:
return {
"error": "No embedded payloads found on profile page",
"stock": symbol,
"url": url,
}
quote_summary_url, quote_summary_payload = select_payload(
payloads, "quoteSummary", symbol
)
quote_url, quote_payload = select_payload(payloads, "v7/finance/quote?", symbol)
quote_type_url, quote_type_payload = select_payload(
payloads, "/v1/finance/quoteType/", symbol
)
ratings_url, ratings_payload = select_payload(payloads, "ratings/top", symbol)
recs_url, recs_payload = select_payload(
payloads, "recommendationsbysymbol", symbol
)
quote_summary = extract_quote_summary(quote_summary_payload)
quote = extract_quote_response(quote_payload)
quote_type = extract_quote_type(quote_type_payload)
summary_detail = quote_summary.get("summaryDetail", {}) if quote_summary else {}
key_stats = quote_summary.get("defaultKeyStatistics", {}) if quote_summary else {}
financial_data = quote_summary.get("financialData", {}) if quote_summary else {}
price_data = quote_summary.get("price", {}) if quote_summary else {}
recommendation_trend = (
quote_summary.get("recommendationTrend", {}) if quote_summary else {}
)
upgrade_history = (
quote_summary.get("upgradeDowngradeHistory", {}) if quote_summary else {}
)
earnings = quote_summary.get("earnings", {}) if quote_summary else {}
calendar_events = quote_summary.get("calendarEvents", {}) if quote_summary else {}
equity_performance = (
quote_summary.get("equityPerformance", {}) if quote_summary else {}
)
performance_overview = (
quote_summary.get("quoteUnadjustedPerformanceOverview", {})
if quote_summary
else {}
)
key_metrics = build_profile_key_metrics(
summary_detail, key_stats, financial_data, price_data, quote
)
valuation = {
"market_cap": extract_raw_value(key_stats.get("marketCap")),
"enterprise_value": extract_raw_value(key_stats.get("enterpriseValue")),
"price_to_book": extract_raw_value(key_stats.get("priceToBook")),
"price_to_sales": extract_raw_value(key_stats.get("priceToSalesTrailing12Months")),
"trailing_pe": key_metrics.get("trailing_pe"),
"forward_pe": key_metrics.get("forward_pe"),
}
profitability = {
"profit_margins": extract_raw_value(financial_data.get("profitMargins")),
"operating_margins": extract_raw_value(financial_data.get("operatingMargins")),
"gross_margins": extract_raw_value(financial_data.get("grossMargins")),
"ebitda_margins": extract_raw_value(financial_data.get("ebitdaMargins")),
"return_on_assets": extract_raw_value(financial_data.get("returnOnAssets")),
"return_on_equity": extract_raw_value(financial_data.get("returnOnEquity")),
}
growth = {
"revenue_growth": extract_raw_value(financial_data.get("revenueGrowth")),
"earnings_growth": extract_raw_value(financial_data.get("earningsGrowth")),
"revenue_per_share": extract_raw_value(financial_data.get("revenuePerShare")),
}
financial_strength = {
"total_cash": extract_raw_value(financial_data.get("totalCash")),
"total_debt": extract_raw_value(financial_data.get("totalDebt")),
"debt_to_equity": extract_raw_value(financial_data.get("debtToEquity")),
"current_ratio": extract_raw_value(financial_data.get("currentRatio")),
"quick_ratio": extract_raw_value(financial_data.get("quickRatio")),
}
cashflow = {
"operating_cashflow": extract_raw_value(financial_data.get("operatingCashflow")),
"free_cashflow": extract_raw_value(financial_data.get("freeCashflow")),
"ebitda": extract_raw_value(financial_data.get("ebitda")),
}
ownership = {
"shares_outstanding": extract_raw_value(key_stats.get("sharesOutstanding")),
"float_shares": extract_raw_value(key_stats.get("floatShares")),
"shares_short": extract_raw_value(key_stats.get("sharesShort")),
"short_ratio": extract_raw_value(key_stats.get("shortRatio")),
"short_percent_of_float": extract_raw_value(key_stats.get("shortPercentOfFloat")),
"held_percent_insiders": extract_raw_value(key_stats.get("heldPercentInsiders")),
"held_percent_institutions": extract_raw_value(
key_stats.get("heldPercentInstitutions")
),
}
analyst = {
"recommendation": {
"key": key_metrics.get("recommendation_key"),
"mean": key_metrics.get("recommendation_mean"),
"analyst_opinion_count": key_metrics.get("analyst_opinion_count"),
"target_price_high": key_metrics.get("target_price_high"),
"target_price_low": key_metrics.get("target_price_low"),
"target_price_mean": key_metrics.get("target_price_mean"),
"target_price_median": key_metrics.get("target_price_median"),
},
"trend": simplify_recommendation_trend(recommendation_trend.get("trend")),
"upgrades_downgrades": simplify_upgrade_history(
upgrade_history.get("history"), limit=20
),
"ratings_top": simplify_ratings_top(ratings_payload),
}
earnings_summary = summarize_earnings(earnings, calendar_events)
performance_summary = {
"equity_performance": summarize_performance(equity_performance),
"unadjusted_performance": summarize_performance(performance_overview),
}
matched_symbols = []
for candidate in [
price_data.get("symbol") if price_data else None,
quote.get("symbol") if quote else None,
quote_type.get("symbol") if quote_type else None,
]:
if candidate:
matched_symbols.append(candidate)
symbol_match = None
if matched_symbols:
symbol_match = any(
candidate.upper() == symbol.upper() for candidate in matched_symbols
)
issues = []
if not quote_summary:
issues.append("missing_quote_summary")
if matched_symbols and not symbol_match:
issues.append("symbol_mismatch")
if not quote:
issues.append("missing_quote_data")
if not quote_type:
issues.append("missing_quote_type")
validation = {
"requested_symbol": symbol,
"matched_symbols": matched_symbols,
"symbol_match": symbol_match,
"issues": issues,
}
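    # missing_quote_summary and symbol_mismatch are fatal; missing_quote_data and
    # missing_quote_type are surfaced in `validation` but still return a payload.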
if "missing_quote_summary" in issues or "symbol_mismatch" in issues:
return {
"error": "Profile validation failed",
"stock": symbol,
"url": url,
"validation": validation,
"data_sources": {
"quote_summary": quote_summary_url,
"quote": quote_url,
"quote_type": quote_type_url,
"ratings_top": ratings_url,
"recommendations": recs_url,
},
}
return {
"stock": symbol,
"url": url,
"fetched_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
"validation": validation,
"key_metrics": key_metrics,
"valuation": valuation,
"profitability": profitability,
"growth": growth,
"financial_strength": financial_strength,
"cashflow": cashflow,
"ownership": ownership,
"analyst": analyst,
"earnings": earnings_summary,
"performance": performance_summary,
"data_sources": {
"quote_summary": quote_summary_url,
"quote": quote_url,
"quote_type": quote_type_url,
"ratings_top": ratings_url,
"recommendations": recs_url,
},
}
def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
def parse_table(table_html, side):
if not table_html:
@@ -656,7 +1279,10 @@ def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
@app.route("/scrape_sync")
def scrape_sync():
    symbol = request.args.get("stock")  # former "MSFT" default removed to prevent silent wrong-symbol data
if not symbol:
return jsonify({"error": "Missing 'stock' parameter"}), 400
expiration = (
request.args.get("expiration")
or request.args.get("expiry")
@@ -672,5 +1298,173 @@ def scrape_sync():
return jsonify(scrape_yahoo_options(symbol, expiration, strike_limit))
@app.route("/profile")
def profile():
symbol = request.args.get("stock")
if not symbol:
return jsonify({"error": "Missing 'stock' parameter"}), 400
app.logger.info("Received /profile request for symbol=%s", symbol)
return jsonify(scrape_yahoo_profile(symbol))
def scrape_truths_sync(count=10, handle="realDonaldTrump"):
app.logger.info("Starting Truth Social scrape for handle=%s count=%d", handle, count)
with sync_playwright() as p:
launch_args = chromium_launch_args()
if launch_args:
app.logger.info("GPU acceleration enabled for Truth Social")
browser = p.chromium.launch(headless=True, args=launch_args)
context = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 2000}
)
page = context.new_page()
try:
url = f"https://truthsocial.com/@{handle}"
app.logger.info("Navigating to %s", url)
page.goto(url, wait_until="domcontentloaded", timeout=60000)
# Wait for content to load
page.wait_for_timeout(5000)
# Scale down to fit more content
page.evaluate("document.body.style.zoom = '0.7'")
page.wait_for_timeout(2000)
# Handle potential modal/ad overlay
try:
close_btn = page.query_selector('[data-testid="close-modal"]')
if close_btn:
close_btn.click()
page.wait_for_timeout(1000)
page.keyboard.press("Escape")
except Exception:
pass
# Wait for any status to appear
selector = '[data-testid="status"]'
try:
page.wait_for_selector(selector, timeout=20000)
except Exception:
selector = '[data-id]'
page.wait_for_selector(selector, timeout=10000)
truths_data = []
seen_ids = set()
# Since virtual lists only render what is near the scroll position,
# we need a few small scrolls even with a tall viewport.
for scroll_step in range(10):
# Get all current statuses
statuses = page.query_selector_all(selector)
for status in statuses:
if len(truths_data) >= count:
break
try:
# Find post ID and validate it belongs to the target handle
links = status.query_selector_all('a')
post_id = None
for link in links:
href = link.get_attribute('href')
if href and f"/@{handle}/posts/" in href:
post_id = href
break
if not post_id or post_id in seen_ids:
continue
seen_ids.add(post_id)
# Content
content_el = status.query_selector('[data-testid="status-content"]')
if not content_el:
content_el = status.query_selector('[data-testid="markup"]')
content_text = content_el.inner_text() if content_el else ""
# Time
time_el = status.query_selector('time')
time_text = time_el.get_attribute('title') if time_el else ""
if not time_text and time_el:
time_text = time_el.inner_text()
# Counts
def get_btn_text(btn_selector):
btn = status.query_selector(btn_selector)
return btn.inner_text() if btn else "0"
reply_count = get_btn_text('button[aria-label="Reply"]')
retruth_count = get_btn_text('button[aria-label="ReTruth"]')
like_count = get_btn_text('button[aria-label="Like"]')
# Media
media_urls = []
imgs = status.query_selector_all('img')
for img in imgs:
alt = img.get_attribute('alt')
if alt in ["Avatar", "Profile header", "Logo", "Verified Account"]:
continue
src = img.get_attribute('src')
if src and ("static-assets" in src or "proxy" in src):
media_urls.append(src)
videos = status.query_selector_all('video')
for video in videos:
src = video.get_attribute('src')
if not src:
source_tag = video.query_selector('source')
if source_tag: src = source_tag.get_attribute('src')
if src: media_urls.append(src)
def clean(c):
return str(c).strip().replace('\n', '')
truths_data.append({
"id": post_id,
"content": content_text,
"time": time_text,
"likes_count": clean(like_count),
"comments_count": clean(reply_count),
"retruths_count": clean(retruth_count),
"media": list(set(media_urls))
})
except Exception:
continue
if len(truths_data) >= count:
break
# Scroll a bit to trigger next items
page.evaluate("window.scrollBy(0, 500)")
page.wait_for_timeout(1000)
app.logger.info("Scraped %d truths", len(truths_data))
return truths_data[:count]
except Exception as e:
app.logger.error("Truths scraper error: %s", e)
return {"error": str(e)}
finally:
browser.close()
@app.route("/truths")
def truths():
try:
count = int(request.args.get("count", 10))
except ValueError:
count = 10
handle = request.args.get("handle", "realDonaldTrump")
app.logger.info("Received /truths request for handle=%s count=%d", handle, count)
return jsonify(scrape_truths_sync(count, handle))
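# Example (local): curl "http://127.0.0.1:9777/truths?handle=realDonaldTrump&count=5"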
if __name__ == "__main__":
app.run(host="0.0.0.0", port=9777)

View File: scripts/test_profile_cycles.py

@@ -0,0 +1,145 @@
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]
REQUIRED_SECTIONS = [
"key_metrics",
"valuation",
"profitability",
"growth",
"financial_strength",
"cashflow",
"ownership",
"analyst",
"earnings",
"performance",
]
REQUIRED_KEY_METRICS = [
"previous_close",
"open",
"bid",
"ask",
"beta",
"eps_trailing",
"dividend_rate",
"current_price",
]
def http_get(base_url, params, timeout):
query = urllib.parse.urlencode(params)
url = f"{base_url}?{query}"
with urllib.request.urlopen(url, timeout=timeout) as resp:
return json.loads(resp.read().decode("utf-8"))
def parse_list(value, default):
if not value:
return default
return [item.strip() for item in value.split(",") if item.strip()]
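# A signature is the sorted key set of every response section; comparing it
# across cycles catches upstream schema drift even when individual values change.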
def build_signature(data):
return {
"key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
"valuation_keys": sorted(data.get("valuation", {}).keys()),
"profitability_keys": sorted(data.get("profitability", {}).keys()),
"growth_keys": sorted(data.get("growth", {}).keys()),
"financial_strength_keys": sorted(data.get("financial_strength", {}).keys()),
"cashflow_keys": sorted(data.get("cashflow", {}).keys()),
"ownership_keys": sorted(data.get("ownership", {}).keys()),
"analyst_keys": sorted(data.get("analyst", {}).keys()),
"earnings_keys": sorted(data.get("earnings", {}).keys()),
"performance_keys": sorted(data.get("performance", {}).keys()),
}
def validate_payload(symbol, data):
if "error" in data:
return f"API error for {symbol}: {data}"
if data.get("stock", "").upper() != symbol.upper():
return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
validation = data.get("validation", {})
if validation.get("symbol_match") is not True:
return f"Validation symbol_match failed for {symbol}: {validation}"
if validation.get("issues"):
return f"Validation issues for {symbol}: {validation}"
for section in REQUIRED_SECTIONS:
if section not in data:
return f"Missing section {section} for {symbol}"
key_metrics = data.get("key_metrics", {})
for field in REQUIRED_KEY_METRICS:
if field not in key_metrics:
return f"Missing key metric {field} for {symbol}"
return None
def main():
parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
parser.add_argument(
"--base-url",
default="http://127.0.0.1:9777/profile",
help="Base URL for /profile",
)
parser.add_argument(
"--symbols",
default=",".join(DEFAULT_SYMBOLS),
help="Comma-separated stock symbols",
)
parser.add_argument(
"--runs",
type=int,
default=8,
help="Number of validation runs per symbol",
)
parser.add_argument(
"--timeout",
type=int,
default=180,
help="Request timeout in seconds",
)
parser.add_argument(
"--sleep",
type=float,
default=0.2,
help="Sleep between requests",
)
args = parser.parse_args()
symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
signatures = {}
print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
for run in range(1, args.runs + 1):
print(f"Cycle {run}/{args.runs}")
for symbol in symbols:
data = http_get(args.base_url, {"stock": symbol}, args.timeout)
error = validate_payload(symbol, data)
if error:
print(f"ERROR: {error}")
sys.exit(1)
signature = build_signature(data)
if symbol not in signatures:
signatures[symbol] = signature
elif signatures[symbol] != signature:
print(f"ERROR: Signature changed for {symbol}")
print(f"Baseline: {signatures[symbol]}")
print(f"Current: {signature}")
sys.exit(1)
time.sleep(args.sleep)
print(f"Cycle {run} OK")
print("\nAll profile cycles completed successfully.")
if __name__ == "__main__":
main()