Compare commits: 690887a6ec...main
9 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 83a5e843c0 | |
| | 4e02c6ce0a | |
| | c01a98abce | |
| | 68805ed80a | |
| | 711d87a998 | |
| | bce40014ad | |
| | 50a7ef119a | |
| | 4ab0e22047 | |
| | 67b8fad423 | |
Dockerfile
@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/playwright/python:v1.50.0-jammy
+FROM mcr.microsoft.com/playwright/python:v1.57.0-jammy
 
 WORKDIR /app
 
@@ -6,7 +6,7 @@ ENV PYTHONUNBUFFERED=1
 
 COPY scraper_service.py /app/scraper_service.py
 
-RUN python -m pip install --no-cache-dir flask beautifulsoup4
+RUN python -m pip install --no-cache-dir flask beautifulsoup4 playwright==1.57.0
 
 EXPOSE 9777
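The base image moves from v1.50.0-jammy to v1.57.0-jammy, and the pip install now pins playwright==1.57.0 to match. As a hypothetical sanity check (not part of this change), one could confirm inside the container that the installed package tracks the base-image tag:

# Hypothetical check, not part of the diff: the pip-installed playwright
# package should match the v1.57.0-jammy base image pinned above.
from importlib.metadata import version

assert version("playwright") == "1.57.0", version("playwright")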
1339 scraper_service.py
File diff suppressed because it is too large
199 scripts/test_cycles.py Normal file
@@ -0,0 +1,199 @@
import argparse
import datetime
import json
import sys
import time
import urllib.parse
import urllib.request

DEFAULT_STOCKS = ["AAPL", "AMZN", "MSFT", "TSLA"]
DEFAULT_CYCLES = [None, 5, 10, 25, 50, 75, 100, 150, 200, 500]


def http_get(base_url, params, timeout):
    query = urllib.parse.urlencode(params)
    url = f"{base_url}?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


def expected_code_from_epoch(epoch):
    return datetime.datetime.utcfromtimestamp(epoch).strftime("%y%m%d")


def all_contracts_match(opts, expected_code):
    for opt in opts:
        name = opt.get("Contract Name") or ""
        if expected_code not in name:
            return False
    return True


def parse_list(value, default):
    if not value:
        return default
    return [item.strip() for item in value.split(",") if item.strip()]


def parse_cycles(value):
    if not value:
        return DEFAULT_CYCLES
    cycles = []
    for item in value.split(","):
        token = item.strip().lower()
        if not token or token in ("default", "none"):
            cycles.append(None)
            continue
        try:
            cycles.append(int(token))
        except ValueError:
            raise ValueError(f"Invalid strikeLimit value: {item}")
    return cycles


def main():
    parser = argparse.ArgumentParser(description="Yahoo options scraper test cycles")
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:9777/scrape_sync",
        help="Base URL for /scrape_sync",
    )
    parser.add_argument(
        "--stocks",
        default=",".join(DEFAULT_STOCKS),
        help="Comma-separated stock symbols",
    )
    parser.add_argument(
        "--strike-limits",
        default="default,5,10,25,50,75,100,150,200,500",
        help="Comma-separated strike limits (use 'default' for the API default)",
    )
    parser.add_argument(
        "--baseline-limit",
        type=int,
        default=5000,
        help="Large strikeLimit used to capture all available strikes",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Request timeout in seconds",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Sleep between requests",
    )
    args = parser.parse_args()

    stocks = parse_list(args.stocks, DEFAULT_STOCKS)
    cycles = parse_cycles(args.strike_limits)

    print("Fetching expiration lists...")
    expirations = {}
    for stock in stocks:
        data = http_get(args.base_url, {"stock": stock, "expiration": "invalid"}, args.timeout)
        if "available_expirations" not in data:
            print(f"ERROR: missing available_expirations for {stock}: {data}")
            sys.exit(1)
        values = [opt.get("value") for opt in data["available_expirations"] if opt.get("value")]
        if len(values) < 4:
            print(f"ERROR: not enough expirations for {stock}: {values}")
            sys.exit(1)
        expirations[stock] = values[:4]
        print(f" {stock}: {expirations[stock]}")
        time.sleep(args.sleep)

    print("\nBuilding baseline counts (strikeLimit=%d)..." % args.baseline_limit)
    baseline_counts = {}
    for stock, exp_list in expirations.items():
        for exp in exp_list:
            data = http_get(
                args.base_url,
                {"stock": stock, "expiration": exp, "strikeLimit": args.baseline_limit},
                args.timeout,
            )
            if "error" in data:
                print(f"ERROR: baseline error for {stock} {exp}: {data}")
                sys.exit(1)
            calls_count = data.get("total_calls")
            puts_count = data.get("total_puts")
            if calls_count is None or puts_count is None:
                print(f"ERROR: baseline missing counts for {stock} {exp}: {data}")
                sys.exit(1)
            expected_code = expected_code_from_epoch(exp)
            if not all_contracts_match(data.get("calls", []), expected_code):
                print(f"ERROR: baseline calls mismatch for {stock} {exp}")
                sys.exit(1)
            if not all_contracts_match(data.get("puts", []), expected_code):
                print(f"ERROR: baseline puts mismatch for {stock} {exp}")
                sys.exit(1)
            baseline_counts[(stock, exp)] = (calls_count, puts_count)
            print(f" {stock} {exp}: calls={calls_count} puts={puts_count}")
            time.sleep(args.sleep)

    print("\nRunning %d cycles of API tests..." % len(cycles))
    for idx, strike_limit in enumerate(cycles, start=1):
        print(f"Cycle {idx}/{len(cycles)} (strikeLimit={strike_limit})")
        for stock, exp_list in expirations.items():
            for exp in exp_list:
                params = {"stock": stock, "expiration": exp}
                if strike_limit is not None:
                    params["strikeLimit"] = strike_limit
                data = http_get(args.base_url, params, args.timeout)
                if "error" in data:
                    print(f"ERROR: {stock} {exp} -> {data}")
                    sys.exit(1)
                selected_val = data.get("selected_expiration", {}).get("value")
                if selected_val != exp:
                    print(
                        f"ERROR: selected expiration mismatch for {stock} {exp}: {selected_val}"
                    )
                    sys.exit(1)
                expected_code = expected_code_from_epoch(exp)
                if not all_contracts_match(data.get("calls", []), expected_code):
                    print(f"ERROR: calls expiry mismatch for {stock} {exp}")
                    sys.exit(1)
                if not all_contracts_match(data.get("puts", []), expected_code):
                    print(f"ERROR: puts expiry mismatch for {stock} {exp}")
                    sys.exit(1)
                available_calls, available_puts = baseline_counts[(stock, exp)]
                expected_limit = strike_limit if strike_limit is not None else 25
                expected_calls = min(expected_limit, available_calls)
                expected_puts = min(expected_limit, available_puts)
                if data.get("total_calls") != expected_calls:
                    print(
                        f"ERROR: call count mismatch for {stock} {exp}: "
                        f"got {data.get('total_calls')} expected {expected_calls}"
                    )
                    sys.exit(1)
                if data.get("total_puts") != expected_puts:
                    print(
                        f"ERROR: put count mismatch for {stock} {exp}: "
                        f"got {data.get('total_puts')} expected {expected_puts}"
                    )
                    sys.exit(1)
                expected_pruned_calls = max(0, available_calls - expected_calls)
                expected_pruned_puts = max(0, available_puts - expected_puts)
                if data.get("pruned_calls_count") != expected_pruned_calls:
                    print(
                        f"ERROR: pruned calls mismatch for {stock} {exp}: "
                        f"got {data.get('pruned_calls_count')} expected {expected_pruned_calls}"
                    )
                    sys.exit(1)
                if data.get("pruned_puts_count") != expected_pruned_puts:
                    print(
                        f"ERROR: pruned puts mismatch for {stock} {exp}: "
                        f"got {data.get('pruned_puts_count')} expected {expected_pruned_puts}"
                    )
                    sys.exit(1)
                time.sleep(args.sleep)
        print(f"Cycle {idx} OK")

    print("\nAll cycles completed successfully.")


if __name__ == "__main__":
    main()
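For orientation, here is a minimal sketch (not part of the diff) of the /scrape_sync response shape that the checks in test_cycles.py imply; the field values are illustrative, and the real payload may carry additional fields.

# Illustrative only: a payload of this shape would satisfy test_cycles.py's checks.
EXAMPLE_SCRAPE_SYNC_RESPONSE = {
    "available_expirations": [{"value": 1766102400}],    # epoch seconds per expiration
    "selected_expiration": {"value": 1766102400},         # must echo the requested expiration
    "calls": [{"Contract Name": "AAPL251219C00250000"}],  # names embed the %y%m%d expiry code
    "puts": [{"Contract Name": "AAPL251219P00250000"}],
    "total_calls": 1,           # min(strikeLimit, available calls); default limit is 25
    "total_puts": 1,
    "pruned_calls_count": 0,    # available contracts minus returned contracts
    "pruned_puts_count": 0,
}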
145 scripts/test_profile_cycles.py Normal file
@@ -0,0 +1,145 @@
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request

DEFAULT_SYMBOLS = ["AAPL", "AMZN", "MSFT", "TSLA"]

REQUIRED_SECTIONS = [
    "key_metrics",
    "valuation",
    "profitability",
    "growth",
    "financial_strength",
    "cashflow",
    "ownership",
    "analyst",
    "earnings",
    "performance",
]

REQUIRED_KEY_METRICS = [
    "previous_close",
    "open",
    "bid",
    "ask",
    "beta",
    "eps_trailing",
    "dividend_rate",
    "current_price",
]


def http_get(base_url, params, timeout):
    query = urllib.parse.urlencode(params)
    url = f"{base_url}?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


def parse_list(value, default):
    if not value:
        return default
    return [item.strip() for item in value.split(",") if item.strip()]


def build_signature(data):
    return {
        "key_metrics_keys": sorted(data.get("key_metrics", {}).keys()),
        "valuation_keys": sorted(data.get("valuation", {}).keys()),
        "profitability_keys": sorted(data.get("profitability", {}).keys()),
        "growth_keys": sorted(data.get("growth", {}).keys()),
        "financial_strength_keys": sorted(data.get("financial_strength", {}).keys()),
        "cashflow_keys": sorted(data.get("cashflow", {}).keys()),
        "ownership_keys": sorted(data.get("ownership", {}).keys()),
        "analyst_keys": sorted(data.get("analyst", {}).keys()),
        "earnings_keys": sorted(data.get("earnings", {}).keys()),
        "performance_keys": sorted(data.get("performance", {}).keys()),
    }


def validate_payload(symbol, data):
    if "error" in data:
        return f"API error for {symbol}: {data}"
    if data.get("stock", "").upper() != symbol.upper():
        return f"Symbol mismatch: expected {symbol} got {data.get('stock')}"
    validation = data.get("validation", {})
    if validation.get("symbol_match") is not True:
        return f"Validation symbol_match failed for {symbol}: {validation}"
    if validation.get("issues"):
        return f"Validation issues for {symbol}: {validation}"

    for section in REQUIRED_SECTIONS:
        if section not in data:
            return f"Missing section {section} for {symbol}"

    key_metrics = data.get("key_metrics", {})
    for field in REQUIRED_KEY_METRICS:
        if field not in key_metrics:
            return f"Missing key metric {field} for {symbol}"

    return None


def main():
    parser = argparse.ArgumentParser(description="Yahoo profile scraper test cycles")
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:9777/profile",
        help="Base URL for /profile",
    )
    parser.add_argument(
        "--symbols",
        default=",".join(DEFAULT_SYMBOLS),
        help="Comma-separated stock symbols",
    )
    parser.add_argument(
        "--runs",
        type=int,
        default=8,
        help="Number of validation runs per symbol",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Request timeout in seconds",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Sleep between requests",
    )
    args = parser.parse_args()

    symbols = parse_list(args.symbols, DEFAULT_SYMBOLS)
    signatures = {}

    print(f"Running {args.runs} profile cycles for: {', '.join(symbols)}")
    for run in range(1, args.runs + 1):
        print(f"Cycle {run}/{args.runs}")
        for symbol in symbols:
            data = http_get(args.base_url, {"stock": symbol}, args.timeout)
            error = validate_payload(symbol, data)
            if error:
                print(f"ERROR: {error}")
                sys.exit(1)
            signature = build_signature(data)
            if symbol not in signatures:
                signatures[symbol] = signature
            elif signatures[symbol] != signature:
                print(f"ERROR: Signature changed for {symbol}")
                print(f"Baseline: {signatures[symbol]}")
                print(f"Current: {signature}")
                sys.exit(1)
            time.sleep(args.sleep)
        print(f"Cycle {run} OK")

    print("\nAll profile cycles completed successfully.")


if __name__ == "__main__":
    main()
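Similarly, a minimal sketch (not part of the diff) of a /profile payload that would pass validate_payload; the values are illustrative and each section may carry more keys than shown.

# Illustrative only: the smallest shape that validate_payload() accepts for "AAPL".
EXAMPLE_PROFILE_RESPONSE = {
    "stock": "AAPL",
    "validation": {"symbol_match": True, "issues": []},
    "key_metrics": {
        "previous_close": 230.1, "open": 231.0, "bid": 230.9, "ask": 231.1,
        "beta": 1.2, "eps_trailing": 6.1, "dividend_rate": 1.0, "current_price": 231.5,
    },
    # the remaining required sections only need to be present for these checks
    "valuation": {}, "profitability": {}, "growth": {}, "financial_strength": {},
    "cashflow": {}, "ownership": {}, "analyst": {}, "earnings": {}, "performance": {},
}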