Add profile endpoint and validation

AGENTS.md (+25 lines)
@@ -16,6 +16,14 @@
 - If `strikeLimit` is greater than available strikes, all available rows are returned.
 - `pruned_calls_count` and `pruned_puts_count` report how many rows were removed beyond the limit.
 - `selected_expiration` reports the resolved expiry (epoch + label), and mismatches return an error.
+- Route: `GET /profile`
+- Query params:
+  - `stock`: symbol (default `MSFT`).
+- Behavior:
+  - Loads `https://finance.yahoo.com/quote/<SYMBOL>/` with Playwright.
+  - Pulls the embedded SvelteKit payloads (quoteSummary, quote, quoteType, ratings, recommendations).
+  - Parses rendered DOM for recent news and the ticker news summary.
+  - Returns company profile, key stats, earnings, analyst data, performance, and news in JSON.
 
 ## Guard Rails
 - Run local 10-cycle validation (4 stocks x 4 expiries) before any deploy or push.
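
The `/profile` route documented in the hunk above can be exercised with nothing but the standard library. A minimal sketch, assuming a server built from this commit is listening on 127.0.0.1:9777:

```python
# Minimal sketch: hit the new /profile endpoint and inspect a few sections.
# Assumes a local server started per AGENTS.md (port 9777).
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"stock": "MSFT"})
with urllib.request.urlopen(f"http://127.0.0.1:9777/profile?{params}", timeout=180) as resp:
    data = json.loads(resp.read().decode("utf-8"))

print(data["validation"])                      # symbol match + any issues
print(data["company_profile"].get("sector"))   # from summaryProfile
print(len(data["recent_news"]))                # up to 20 rendered news items
```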
@@ -28,9 +36,26 @@
 - Local server:
   - Start: `.\venv\Scripts\python.exe scraper_service.py`
   - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (local server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
 - Docker server:
   - Start: `docker run --rm -p 9777:9777 rushabhtechie/yahoo-scraper:latest`
   - Validate: `python scripts/test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Profile validation (docker server):
+  - Validate: `python scripts/test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8`
+
+## Update Log (2025-12-28)
+- Added `/profile` endpoint backed by SvelteKit payload parsing (quoteSummary, quote, quoteType, ratings, recommendations) plus rendered news extraction.
+- Response now includes company profile, key stats, earnings, analyst trends, performance overview, ticker news summary, and recent news items.
+- Validation added to ensure quote data matches the requested symbol, with issues reported in `validation`.
+- Issue encountered: an existing server instance was bound to port 9777 without `/profile`; resolved by restarting the service with the updated script.
+- Tests executed (local):
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --runs 8 --timeout 180`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+- Tests executed (docker):
+  - `docker build -t rushabhtechie/yahoo-scraper:latest .`
+  - `.\venv\Scripts\python.exe scripts\test_cycles.py --base-url http://127.0.0.1:9777/scrape_sync`
+  - `.\venv\Scripts\python.exe scripts\test_profile_cycles.py --base-url http://127.0.0.1:9777/profile --runs 8 --timeout 180`
 - The test harness verifies:
   - Requested expiration matches `selected_expiration.value`.
   - Contract symbols include the expected YYMMDD code.
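
For reference, the `validation` object mentioned in the Update Log has the following shape in the code added by this commit (field names are from the diff; the values here are illustrative):

```python
# Shape of the `validation` block in a /profile response (values illustrative).
validation = {
    "requested_symbol": "MSFT",
    "matched_symbols": ["MSFT"],  # symbols seen in the price/quote/quoteType payloads
    "symbol_match": True,
    "issues": [],                 # possible: "missing_quote_summary", "symbol_mismatch",
                                  #           "missing_quote_data", "missing_quote_type"
}
```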

scraper_service.py
@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
 from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone
+import html
 import urllib.parse
 import logging
 import json
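
The one new import is `html`; it is used further down to unescape the `data-url` attribute on the embedded payload `<script>` tags. A one-line illustration (the URL is made up):

```python
import html

# data-url attributes arrive HTML-escaped; unescape restores the raw URL.
escaped = "https://query1.finance.yahoo.com/v7/finance/quote?symbols=MSFT&amp;lang=en-US"
print(html.unescape(escaped))
# -> https://query1.finance.yahoo.com/v7/finance/quote?symbols=MSFT&lang=en-US
```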
@@ -353,6 +354,415 @@ def parse_strike_limit(value, default=25):
     return limit if limit > 0 else default
 
 
+def parse_sveltekit_payload(raw_text):
+    if not raw_text:
+        return None
+    try:
+        outer = json.loads(raw_text)
+    except json.JSONDecodeError:
+        return None
+    body = outer.get("body")
+    if isinstance(body, str):
+        try:
+            body = json.loads(body)
+        except json.JSONDecodeError:
+            pass
+    return {
+        "status": outer.get("status"),
+        "statusText": outer.get("statusText"),
+        "body": body,
+    }
+
+
+def extract_sveltekit_payloads_from_soup(soup):
+    payloads = {}
+    if soup is None:
+        return payloads
+    scripts = soup.select('script[type="application/json"][data-sveltekit-fetched]')
+    for script in scripts:
+        url = script.get("data-url")
+        if not url:
+            continue
+        url = html.unescape(url)
+        raw_text = script.string or script.get_text()
+        payload = parse_sveltekit_payload(raw_text)
+        if not payload:
+            continue
+        payloads[url] = payload
+    return payloads
+
+
+def select_payload(payloads, needle, symbol=None):
+    if not payloads:
+        return None, None
+    needle = needle.lower()
+    symbol_token = symbol.lower() if symbol else None
+    fallback = None
+    for url, payload in payloads.items():
+        url_lower = url.lower()
+        if needle not in url_lower:
+            continue
+        if symbol_token:
+            if f"/{symbol_token}" in url_lower or f"symbols={symbol_token}" in url_lower:
+                return url, payload.get("body")
+        if fallback is None:
+            fallback = (url, payload.get("body"))
+    return fallback if fallback else (None, None)
+
+
+def extract_quote_summary(payload):
+    if not payload:
+        return None
+    summary = payload.get("quoteSummary")
+    if not summary:
+        return None
+    result = summary.get("result") or []
+    return result[0] if result else None
+
+
+def extract_quote_response(payload):
+    if not payload:
+        return None
+    response = payload.get("quoteResponse")
+    if not response:
+        return None
+    result = response.get("result") or []
+    return result[0] if result else None
+
+
+def extract_quote_type(payload):
+    if not payload:
+        return None
+    quote_type = payload.get("quoteType")
+    if not quote_type:
+        return None
+    result = quote_type.get("result") or []
+    return result[0] if result else None
+
+
+def extract_recent_news_from_soup(soup, limit=20):
+    items = []
+    if soup is None:
+        return items
+    container = soup.select_one('[data-testid="recent-news"]')
+    root = container if container else soup
+    seen = set()
+    for item in root.select('[data-testid="storyitem"]'):
+        title_el = item.select_one("h3")
+        link_el = item.select_one("a[href]")
+        if not title_el and not link_el:
+            continue
+        title = title_el.get_text(strip=True) if title_el else None
+        link = link_el.get("href") if link_el else None
+        publisher = None
+        published = None
+        publishing = item.select_one(".publishing")
+        if publishing:
+            text = " ".join(publishing.stripped_strings)
+            if "\u2022" in text:
+                parts = [part.strip() for part in text.split("\u2022", 1)]
+                publisher = parts[0] or None
+                published = parts[1] if len(parts) > 1 else None
+            else:
+                publisher = text or None
+        key = link or title
+        if key and key in seen:
+            continue
+        if key:
+            seen.add(key)
+        items.append(
+            {
+                "title": title,
+                "publisher": publisher,
+                "published": published,
+                "link": link,
+            }
+        )
+        if limit and len(items) >= limit:
+            break
+    return items
+
+
+def extract_news_summary_from_soup(soup):
+    if soup is None:
+        return None
+    summary = soup.select_one('[data-testid="ticker-news-summary"]')
+    if not summary:
+        return None
+    text = " ".join(summary.stripped_strings)
+    return text if text else None
+
+
+def build_profile_key_metrics(summary_detail, key_stats, financial_data, price_data, quote):
+    summary_detail = summary_detail or {}
+    key_stats = key_stats or {}
+    financial_data = financial_data or {}
+    price_data = price_data or {}
+    quote = quote or {}
+
+    def pick_value(*values):
+        for value in values:
+            if value is not None:
+                return value
+        return None
+
+    return {
+        "previous_close": extract_raw_value(summary_detail.get("previousClose")),
+        "open": extract_raw_value(summary_detail.get("open")),
+        "bid": extract_raw_value(summary_detail.get("bid")),
+        "ask": extract_raw_value(summary_detail.get("ask")),
+        "bid_size": extract_raw_value(summary_detail.get("bidSize")),
+        "ask_size": extract_raw_value(summary_detail.get("askSize")),
+        "day_low": extract_raw_value(summary_detail.get("dayLow")),
+        "day_high": extract_raw_value(summary_detail.get("dayHigh")),
+        "fifty_two_week_low": extract_raw_value(quote.get("fiftyTwoWeekLow")),
+        "fifty_two_week_high": extract_raw_value(quote.get("fiftyTwoWeekHigh")),
+        "volume": pick_value(
+            extract_raw_value(summary_detail.get("volume")),
+            extract_raw_value(price_data.get("regularMarketVolume")),
+            extract_raw_value(quote.get("regularMarketVolume")),
+        ),
+        "average_volume": pick_value(
+            extract_raw_value(summary_detail.get("averageVolume")),
+            extract_raw_value(price_data.get("averageDailyVolume3Month")),
+        ),
+        "market_cap": pick_value(
+            extract_raw_value(summary_detail.get("marketCap")),
+            extract_raw_value(quote.get("marketCap")),
+        ),
+        "beta": pick_value(
+            extract_raw_value(summary_detail.get("beta")),
+            extract_raw_value(key_stats.get("beta")),
+        ),
+        "trailing_pe": pick_value(
+            extract_raw_value(summary_detail.get("trailingPE")),
+            extract_raw_value(key_stats.get("trailingPE")),
+        ),
+        "forward_pe": pick_value(
+            extract_raw_value(summary_detail.get("forwardPE")),
+            extract_raw_value(key_stats.get("forwardPE")),
+        ),
+        "eps_trailing": extract_raw_value(key_stats.get("trailingEps")),
+        "eps_forward": extract_raw_value(key_stats.get("forwardEps")),
+        "dividend_rate": extract_raw_value(summary_detail.get("dividendRate")),
+        "dividend_yield": extract_raw_value(summary_detail.get("dividendYield")),
+        "ex_dividend_date": extract_raw_value(summary_detail.get("exDividendDate")),
+        "payout_ratio": extract_raw_value(summary_detail.get("payoutRatio")),
+        "current_price": pick_value(
+            extract_raw_value(price_data.get("regularMarketPrice")),
+            extract_raw_value(financial_data.get("currentPrice")),
+            extract_raw_value(quote.get("regularMarketPrice")),
+        ),
+        "recommendation_key": financial_data.get("recommendationKey"),
+        "recommendation_mean": extract_raw_value(financial_data.get("recommendationMean")),
+        "target_price_high": extract_raw_value(financial_data.get("targetHighPrice")),
+        "target_price_low": extract_raw_value(financial_data.get("targetLowPrice")),
+        "target_price_mean": extract_raw_value(financial_data.get("targetMeanPrice")),
+        "target_price_median": extract_raw_value(financial_data.get("targetMedianPrice")),
+        "analyst_opinion_count": extract_raw_value(
+            financial_data.get("numberOfAnalystOpinions")
+        ),
+    }
+
+
+def scrape_yahoo_profile(symbol):
+    encoded = urllib.parse.quote(symbol, safe="")
+    url = f"https://finance.yahoo.com/quote/{encoded}/"
+    app.logger.info("Starting profile scrape for symbol=%s url=%s", symbol, url)
+
+    response_html = None
+    rendered_html = None
+    payloads = {}
+    news = []
+    news_summary = None
+
+    with sync_playwright() as p:
+        launch_args = chromium_launch_args()
+        if launch_args:
+            app.logger.info("GPU acceleration enabled")
+        else:
+            app.logger.info("GPU acceleration disabled")
+        browser = p.chromium.launch(headless=True, args=launch_args)
+        page = browser.new_page()
+        page.set_extra_http_headers(
+            {
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
+                )
+            }
+        )
+        page.set_default_timeout(60000)
+
+        try:
+            response = page.goto(url, wait_until="domcontentloaded", timeout=60000)
+            app.logger.info("Profile page loaded (domcontentloaded) for %s", symbol)
+            if response:
+                response_html = response.text()
+            else:
+                app.logger.warning("No response body for profile page %s", symbol)
+
+            try:
+                page.wait_for_selector(
+                    '[data-testid="recent-news"], [data-testid="ticker-news-summary"]',
+                    timeout=15000,
+                )
+            except Exception as exc:
+                app.logger.warning("News content not detected for %s: %s", symbol, exc)
+
+            page.wait_for_timeout(2000)
+            rendered_html = page.content()
+        finally:
+            browser.close()
+
+    if not response_html and not rendered_html:
+        return {"error": "Profile page content missing", "stock": symbol, "url": url}
+
+    payload_source = response_html or rendered_html
+    payload_soup = BeautifulSoup(payload_source, "html.parser") if payload_source else None
+    payloads = extract_sveltekit_payloads_from_soup(payload_soup)
+    if not payloads and rendered_html and rendered_html != payload_source:
+        fallback_soup = BeautifulSoup(rendered_html, "html.parser")
+        payloads = extract_sveltekit_payloads_from_soup(fallback_soup)
+
+    if rendered_html:
+        news_soup = BeautifulSoup(rendered_html, "html.parser")
+        news = extract_recent_news_from_soup(news_soup, limit=20)
+        news_summary = extract_news_summary_from_soup(news_soup)
+
+    if not payloads:
+        return {
+            "error": "No embedded payloads found on profile page",
+            "stock": symbol,
+            "url": url,
+        }
+
+    quote_summary_url, quote_summary_payload = select_payload(
+        payloads, "quoteSummary", symbol
+    )
+    quote_url, quote_payload = select_payload(payloads, "v7/finance/quote?", symbol)
+    quote_type_url, quote_type_payload = select_payload(
+        payloads, "/v1/finance/quoteType/", symbol
+    )
+    ratings_url, ratings_payload = select_payload(payloads, "ratings/top", symbol)
+    recs_url, recs_payload = select_payload(
+        payloads, "recommendationsbysymbol", symbol
+    )
+
+    quote_summary = extract_quote_summary(quote_summary_payload)
+    quote = extract_quote_response(quote_payload)
+    quote_type = extract_quote_type(quote_type_payload)
+
+    summary_profile = quote_summary.get("summaryProfile", {}) if quote_summary else {}
+    summary_detail = quote_summary.get("summaryDetail", {}) if quote_summary else {}
+    key_stats = quote_summary.get("defaultKeyStatistics", {}) if quote_summary else {}
+    financial_data = quote_summary.get("financialData", {}) if quote_summary else {}
+    price_data = quote_summary.get("price", {}) if quote_summary else {}
+    recommendation_trend = (
+        quote_summary.get("recommendationTrend", {}) if quote_summary else {}
+    )
+    upgrade_history = (
+        quote_summary.get("upgradeDowngradeHistory", {}) if quote_summary else {}
+    )
+    earnings = quote_summary.get("earnings", {}) if quote_summary else {}
+    earnings_gaap = quote_summary.get("earningsGaap", {}) if quote_summary else {}
+    earnings_non_gaap = quote_summary.get("earningsNonGaap", {}) if quote_summary else {}
+    calendar_events = quote_summary.get("calendarEvents", {}) if quote_summary else {}
+    equity_performance = (
+        quote_summary.get("equityPerformance", {}) if quote_summary else {}
+    )
+    performance_overview = (
+        quote_summary.get("quoteUnadjustedPerformanceOverview", {})
+        if quote_summary
+        else {}
+    )
+
+    key_metrics = build_profile_key_metrics(
+        summary_detail, key_stats, financial_data, price_data, quote
+    )
+
+    matched_symbols = []
+    for candidate in [
+        price_data.get("symbol") if price_data else None,
+        quote.get("symbol") if quote else None,
+        quote_type.get("symbol") if quote_type else None,
+    ]:
+        if candidate:
+            matched_symbols.append(candidate)
+
+    symbol_match = None
+    if matched_symbols:
+        symbol_match = any(
+            candidate.upper() == symbol.upper() for candidate in matched_symbols
+        )
+
+    issues = []
+    if not quote_summary:
+        issues.append("missing_quote_summary")
+    if matched_symbols and not symbol_match:
+        issues.append("symbol_mismatch")
+    if not quote:
+        issues.append("missing_quote_data")
+    if not quote_type:
+        issues.append("missing_quote_type")
+
+    validation = {
+        "requested_symbol": symbol,
+        "matched_symbols": matched_symbols,
+        "symbol_match": symbol_match,
+        "issues": issues,
+    }
+
+    if "missing_quote_summary" in issues or "symbol_mismatch" in issues:
+        return {
+            "error": "Profile validation failed",
+            "stock": symbol,
+            "url": url,
+            "validation": validation,
+            "data_sources": {
+                "quote_summary": quote_summary_url,
+                "quote": quote_url,
+                "quote_type": quote_type_url,
+                "ratings_top": ratings_url,
+                "recommendations": recs_url,
+            },
+        }
+
+    return {
+        "stock": symbol,
+        "url": url,
+        "fetched_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "validation": validation,
+        "company_profile": summary_profile,
+        "summary_detail": summary_detail,
+        "default_key_statistics": key_stats,
+        "financial_data": financial_data,
+        "price": price_data,
+        "earnings": earnings,
+        "earnings_gaap": earnings_gaap,
+        "earnings_non_gaap": earnings_non_gaap,
+        "calendar_events": calendar_events,
+        "equity_performance": equity_performance,
+        "performance_overview": performance_overview,
+        "recommendation_trend": recommendation_trend,
+        "upgrade_downgrade_history": upgrade_history,
+        "key_metrics": key_metrics,
+        "quote": quote,
+        "quote_type": quote_type,
+        "recommendations_by_symbol": recs_payload,
+        "ratings_top": ratings_payload,
+        "news_summary": news_summary,
+        "recent_news": news,
+        "data_sources": {
+            "quote_summary": quote_summary_url,
+            "quote": quote_url,
+            "quote_type": quote_type_url,
+            "ratings_top": ratings_url,
+            "recommendations": recs_url,
+        },
+    }
+
+
 def scrape_yahoo_options(symbol, expiration=None, strike_limit=25):
     def parse_table(table_html, side):
         if not table_html:
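
To make the payload pipeline above concrete, here is a small sketch of how the helpers compose. The inline HTML snippet is illustrative, not a real Yahoo page, and importing the helpers from `scraper_service` is an assumption about the module name:

```python
# Sketch of the SvelteKit payload-extraction path using the helpers added above.
# The HTML is a made-up stand-in for a Yahoo quote page.
from bs4 import BeautifulSoup
from scraper_service import (  # module name assumed
    extract_sveltekit_payloads_from_soup,
    select_payload,
    extract_quote_response,
)

SAMPLE = '''
<script type="application/json" data-sveltekit-fetched
        data-url="https://query1.finance.yahoo.com/v7/finance/quote?symbols=MSFT">
{"status": 200, "statusText": "OK", "body": "{\\"quoteResponse\\": {\\"result\\": [{\\"symbol\\": \\"MSFT\\"}]}}"}
</script>
'''

soup = BeautifulSoup(SAMPLE, "html.parser")
payloads = extract_sveltekit_payloads_from_soup(soup)  # {data-url: {"status", "statusText", "body"}}
url, body = select_payload(payloads, "v7/finance/quote?", "MSFT")
print(url)                           # the matched data-url
print(extract_quote_response(body))  # {'symbol': 'MSFT'}
```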
@@ -672,5 +1082,12 @@ def scrape_sync():
     return jsonify(scrape_yahoo_options(symbol, expiration, strike_limit))
 
 
+@app.route("/profile")
+def profile():
+    symbol = request.args.get("stock", "MSFT")
+    app.logger.info("Received /profile request for symbol=%s", symbol)
+    return jsonify(scrape_yahoo_profile(symbol))
+
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=9777)
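
The route wiring itself can be smoke-tested without launching a browser by stubbing the scraper behind Flask's test client. A sketch, assuming the module is importable as `scraper_service`:

```python
# Sketch: verify /profile request handling with Flask's test client, stubbing
# scrape_yahoo_profile so no Playwright browser is launched.
import scraper_service  # module name assumed

scraper_service.scrape_yahoo_profile = lambda symbol: {
    "stock": symbol,
    "validation": {"issues": []},
}

client = scraper_service.app.test_client()
resp = client.get("/profile?stock=TSLA")
assert resp.get_json()["stock"] == "TSLA"
```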

scripts/test_profile_cycles.py (new file, +158 lines)
@@ -0,0 +1,158 @@
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]
+
+REQUIRED_SECTIONS = [
+    "company_profile",
+    "summary_detail",
+    "default_key_statistics",
+    "financial_data",
+    "price",
+    "key_metrics",
+    "recommendation_trend",
+    "upgrade_downgrade_history",
+    "earnings",
+    "calendar_events",
+    "equity_performance",
+    "performance_overview",
+    "quote",
+    "quote_type",
+    "recent_news",
+]
+
+REQUIRED_COMPANY_FIELDS = ["longBusinessSummary", "industry", "sector"]
+REQUIRED_KEY_METRICS = [
+    "previous_close",
+    "open",
+    "bid",
+    "ask",
+    "beta",
+    "eps_trailing",
+    "dividend_rate",
+    "current_price",
+]
+
+
+def http_get(base_url, params, timeout):
+    query = urllib.parse.urlencode(params)
+    url = f"{base_url}?{query}"
+    with urllib.request.urlopen(url, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def parse_list(value, default):
+    if not value:
+        return default
+    return [item.strip() for item in value.split(",") if item.strip()]
+
+
+def build_signature(data):
+    return {
+        "company_profile_keys": sorted(data.get("company_profile", {}).keys()),
+        "summary_detail_keys": sorted(data.get("summary_detail", {}).keys()),
+        "default_key_statistics_keys": sorted(
+            data.get("default_key_statistics", {}).keys()
+        ),
+        "financial_data_keys": sorted(data.get("financial_data", {}).keys()),
+        "price_keys": sorted(data.get("price", {}).keys()),
+        "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
+        "data_sources_keys": sorted(data.get("data_sources", {}).keys()),
+    }
+
+
+def validate_payload(symbol, data):
+    if "error" in data:
+        return f"API error for {symbol}: {data}"
+    if data.get("stock", "").upper() != symbol.upper():
+        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
+    validation = data.get("validation", {})
+    if validation.get("symbol_match") is not True:
+        return f"Validation symbol_match failed for {symbol}: {validation}"
+    if validation.get("issues"):
+        return f"Validation issues for {symbol}: {validation}"
+
+    for section in REQUIRED_SECTIONS:
+        if section not in data:
+            return f"Missing section {section} for {symbol}"
+
+    company_profile = data.get("company_profile", {})
+    for field in REQUIRED_COMPANY_FIELDS:
+        if field not in company_profile:
+            return f"Missing company field {field} for {symbol}"
+
+    key_metrics = data.get("key_metrics", {})
+    for field in REQUIRED_KEY_METRICS:
+        if field not in key_metrics:
+            return f"Missing key metric {field} for {symbol}"
+
+    if not data.get("news_summary") and not data.get("recent_news"):
+        return f"Missing news summary and recent news for {symbol}"
+
+    return None
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
+    parser.add_argument(
+        "--base-url",
+        default="http://127.0.0.1:9777/profile",
+        help="Base URL for /profile",
+    )
+    parser.add_argument(
+        "--symbols",
+        default=",".join(DEFAULT_SYMBOLS),
+        help="Comma-separated stock symbols",
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=8,
+        help="Number of validation runs per symbol",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=180,
+        help="Request timeout in seconds",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.2,
+        help="Sleep between requests",
+    )
+    args = parser.parse_args()
+
+    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
+    signatures = {}
+
+    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
+    for run in range(1, args.runs + 1):
+        print(f"Cycle {run}/{args.runs}")
+        for symbol in symbols:
+            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
+            error = validate_payload(symbol, data)
+            if error:
+                print(f"ERROR: {error}")
+                sys.exit(1)
+            signature = build_signature(data)
+            if symbol not in signatures:
+                signatures[symbol] = signature
+            elif signatures[symbol] != signature:
+                print(f"ERROR: Signature changed for {symbol}")
+                print(f"Baseline: {signatures[symbol]}")
+                print(f"Current: {signature}")
+                sys.exit(1)
+            time.sleep(args.sleep)
+        print(f"Cycle {run} OK")
+
+    print("\nAll profile cycles completed successfully.")
+
+
+if __name__ == "__main__":
+    main()
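
A toy illustration of the signature-drift check: `build_signature` records sorted key sets, so key order is irrelevant but a dropped key is caught across cycles (the data dicts below are made up, and `build_signature` from the script above is assumed in scope):

```python
# Same keys in a different order -> equal signatures; a missing key -> drift.
a = {"company_profile": {"sector": "Tech", "industry": "Software"}}
b = {"company_profile": {"industry": "Software", "sector": "Tech"}}
c = {"company_profile": {"sector": "Tech"}}

assert build_signature(a) == build_signature(b)  # order-insensitive
assert build_signature(a) != build_signature(c)  # "industry" missing -> detected
```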