From 690887a6ec29d7bee9638187e8c3f88678108a84 Mon Sep 17 00:00:00 2001 From: Rushabh Gosar Date: Sat, 27 Dec 2025 16:43:24 -0800 Subject: [PATCH] Add docker artifacts and agent context --- .dockerignore | 13 ++ .gitignore | 7 + AGENTS.md | 424 +++++++++++++++++++++++++++++++++++++++++++++ Dockerfile | 13 ++ scraper_service.py | 342 ++++++++++++++++++++++++++++++------ 5 files changed, 745 insertions(+), 54 deletions(-) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ad9d337 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +.git/ +.gitignore +__pycache__/ +*.pyc +venv/ +.venv/ +.env +.env.* +.pytest_cache/ +charts/ +yahoo.html +scraper_service(works).py +scraper_service.working.backup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ed0e51 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.pyc +venv/ +.venv/ +.env +.env.* +.pytest_cache/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..9c5449a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,424 @@ +# AGENTS.md + +## Context +- This project exposes a Flask API that uses Playwright to scrape Yahoo Finance options chains. +- Entry point: `scraper_service.py` (launched via `runner.bat` or directly with Python). +- API route: `GET /scrape_sync` with `stock` and optional `expiration|expiry|date` parameters. +- Expiration inputs: epoch seconds (Yahoo date param) or date strings supported by `DATE_FORMATS`. + +## Docker +- Build: `docker build -t scraper-service:latest .` +- Run: `docker run --rm -p 9777:9777 scraper-service:latest` +- The container uses the Playwright base image with bundled browsers. + +## Line-by-line explanation of scraper_service.py + +- Line 1: Import symbols from flask. Code: `from flask import Flask, jsonify, request` +- Line 2: Import symbols from playwright.sync_api. 
Code: `from playwright.sync_api import sync_playwright` +- Line 3: Import symbols from bs4. Code: `from bs4 import BeautifulSoup` +- Line 4: Import symbols from datetime. Code: `from datetime import datetime, timezone` +- Line 5: Import module urllib.parse. Code: `import urllib.parse` +- Line 6: Import module logging. Code: `import logging` +- Line 7: Import module re. Code: `import re` +- Line 8: Import module time. Code: `import time` +- Line 9: Blank line for readability. Code: `` +- Line 10: Create the Flask application instance. Code: `app = Flask(__name__)` +- Line 11: Blank line for readability. Code: `` +- Line 12: Comment describing the next block. Code: `# Logging` +- Line 13: Configure logging defaults. Code: `logging.basicConfig(` +- Line 14: Execute the statement as written. Code: `level=logging.INFO,` +- Line 15: Execute the statement as written. Code: `format="%(asctime)s [%(levelname)s] %(message)s"` +- Line 16: Close the current block or container. Code: `)` +- Line 17: Set the Flask logger level. Code: `app.logger.setLevel(logging.INFO)` +- Line 18: Blank line for readability. Code: `` +- Line 19: Define accepted expiration date string formats. Code: `DATE_FORMATS = (` +- Line 20: Execute the statement as written. Code: `"%Y-%m-%d",` +- Line 21: Execute the statement as written. Code: `"%Y/%m/%d",` +- Line 22: Execute the statement as written. Code: `"%Y%m%d",` +- Line 23: Execute the statement as written. Code: `"%b %d, %Y",` +- Line 24: Execute the statement as written. Code: `"%B %d, %Y",` +- Line 25: Close the current block or container. Code: `)` +- Line 26: Blank line for readability. Code: `` +- Line 27: Blank line for readability. Code: `` +- Line 28: Define the parse_date function. Code: `def parse_date(value):` +- Line 29: Loop over items. Code: `for fmt in DATE_FORMATS:` +- Line 30: Start a try block for error handling. Code: `try:` +- Line 31: Return a value to the caller. 
Code: `return datetime.strptime(value, fmt).date()` +- Line 32: Handle exceptions for the preceding try block. Code: `except ValueError:` +- Line 33: Execute the statement as written. Code: `continue` +- Line 34: Return a value to the caller. Code: `return None` +- Line 35: Blank line for readability. Code: `` +- Line 36: Blank line for readability. Code: `` +- Line 37: Define the normalize_label function. Code: `def normalize_label(value):` +- Line 38: Return a value to the caller. Code: `return " ".join(value.strip().split()).lower()` +- Line 39: Blank line for readability. Code: `` +- Line 40: Blank line for readability. Code: `` +- Line 41: Define the format_expiration_label function. Code: `def format_expiration_label(timestamp):` +- Line 42: Start a try block for error handling. Code: `try:` +- Line 43: Return a value to the caller. Code: `return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d")` +- Line 44: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 45: Return a value to the caller. Code: `return str(timestamp)` +- Line 46: Blank line for readability. Code: `` +- Line 47: Blank line for readability. Code: `` +- Line 48: Define the extract_expiration_dates_from_html function. Code: `def extract_expiration_dates_from_html(html):` +- Line 49: Conditional branch. Code: `if not html:` +- Line 50: Return a value to the caller. Code: `return []` +- Line 51: Blank line for readability. Code: `` +- Line 52: Execute the statement as written. Code: `patterns = (` +- Line 53: Execute the statement as written. Code: `r'\\"expirationDates\\":\[(.*?)\]',` +- Line 54: Execute the statement as written. Code: `r'"expirationDates":\[(.*?)\]',` +- Line 55: Close the current block or container. Code: `)` +- Line 56: Execute the statement as written. Code: `match = None` +- Line 57: Loop over items. Code: `for pattern in patterns:` +- Line 58: Execute the statement as written. 
Code: `match = re.search(pattern, html, re.DOTALL)` +- Line 59: Conditional branch. Code: `if match:` +- Line 60: Execute the statement as written. Code: `break` +- Line 61: Conditional branch. Code: `if not match:` +- Line 62: Return a value to the caller. Code: `return []` +- Line 63: Blank line for readability. Code: `` +- Line 64: Execute the statement as written. Code: `raw = match.group(1)` +- Line 65: Execute the statement as written. Code: `values = []` +- Line 66: Loop over items. Code: `for part in raw.split(","):` +- Line 67: Execute the statement as written. Code: `part = part.strip()` +- Line 68: Conditional branch. Code: `if part.isdigit():` +- Line 69: Start a try block for error handling. Code: `try:` +- Line 70: Execute the statement as written. Code: `values.append(int(part))` +- Line 71: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 72: Execute the statement as written. Code: `continue` +- Line 73: Return a value to the caller. Code: `return values` +- Line 74: Blank line for readability. Code: `` +- Line 75: Blank line for readability. Code: `` +- Line 76: Define the build_expiration_options function. Code: `def build_expiration_options(expiration_dates):` +- Line 77: Execute the statement as written. Code: `options = []` +- Line 78: Loop over items. Code: `for value in expiration_dates or []:` +- Line 79: Start a try block for error handling. Code: `try:` +- Line 80: Execute the statement as written. Code: `value_int = int(value)` +- Line 81: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 82: Execute the statement as written. Code: `continue` +- Line 83: Blank line for readability. Code: `` +- Line 84: Execute the statement as written. Code: `label = format_expiration_label(value_int)` +- Line 85: Start a try block for error handling. Code: `try:` +- Line 86: Execute the statement as written. 
Code: `date_value = datetime.utcfromtimestamp(value_int).date()` +- Line 87: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 88: Execute the statement as written. Code: `date_value = None` +- Line 89: Blank line for readability. Code: `` +- Line 90: Execute the statement as written. Code: `options.append({"value": value_int, "label": label, "date": date_value})` +- Line 91: Return a value to the caller. Code: `return sorted(options, key=lambda x: x["value"])` +- Line 92: Blank line for readability. Code: `` +- Line 93: Blank line for readability. Code: `` +- Line 94: Define the resolve_expiration function. Code: `def resolve_expiration(expiration, options):` +- Line 95: Conditional branch. Code: `if not expiration:` +- Line 96: Return a value to the caller. Code: `return None, None` +- Line 97: Blank line for readability. Code: `` +- Line 98: Execute the statement as written. Code: `raw = expiration.strip()` +- Line 99: Conditional branch. Code: `if not raw:` +- Line 100: Return a value to the caller. Code: `return None, None` +- Line 101: Blank line for readability. Code: `` +- Line 102: Conditional branch. Code: `if raw.isdigit():` +- Line 103: Execute the statement as written. Code: `value = int(raw)` +- Line 104: Conditional branch. Code: `if options:` +- Line 105: Loop over items. Code: `for opt in options:` +- Line 106: Conditional branch. Code: `if opt.get("value") == value:` +- Line 107: Return a value to the caller. Code: `return value, opt.get("label")` +- Line 108: Return a value to the caller. Code: `return None, None` +- Line 109: Return a value to the caller. Code: `return value, format_expiration_label(value)` +- Line 110: Blank line for readability. Code: `` +- Line 111: Execute the statement as written. Code: `requested_date = parse_date(raw)` +- Line 112: Conditional branch. Code: `if requested_date:` +- Line 113: Loop over items. Code: `for opt in options:` +- Line 114: Conditional branch. 
Code: `if opt.get("date") == requested_date:` +- Line 115: Return a value to the caller. Code: `return opt.get("value"), opt.get("label")` +- Line 116: Return a value to the caller. Code: `return None, None` +- Line 117: Blank line for readability. Code: `` +- Line 118: Execute the statement as written. Code: `normalized = normalize_label(raw)` +- Line 119: Loop over items. Code: `for opt in options:` +- Line 120: Conditional branch. Code: `if normalize_label(opt.get("label", "")) == normalized:` +- Line 121: Return a value to the caller. Code: `return opt.get("value"), opt.get("label")` +- Line 122: Blank line for readability. Code: `` +- Line 123: Return a value to the caller. Code: `return None, None` +- Line 124: Blank line for readability. Code: `` +- Line 125: Blank line for readability. Code: `` +- Line 126: Define the wait_for_tables function. Code: `def wait_for_tables(page):` +- Line 127: Start a try block for error handling. Code: `try:` +- Line 128: Interact with the Playwright page. Code: `page.wait_for_selector(` +- Line 129: Execute the statement as written. Code: `"section[data-testid='options-list-table'] table",` +- Line 130: Execute the statement as written. Code: `timeout=30000,` +- Line 131: Close the current block or container. Code: `)` +- Line 132: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 133: Interact with the Playwright page. Code: `page.wait_for_selector("table", timeout=30000)` +- Line 134: Blank line for readability. Code: `` +- Line 135: Loop over items. Code: `for _ in range(30): # 30 * 1s = 30 seconds` +- Line 136: Collect option tables from the page. Code: `tables = page.query_selector_all(` +- Line 137: Execute the statement as written. Code: `"section[data-testid='options-list-table'] table"` +- Line 138: Close the current block or container. Code: `)` +- Line 139: Conditional branch. Code: `if len(tables) >= 2:` +- Line 140: Return a value to the caller. 
Code: `return tables` +- Line 141: Collect option tables from the page. Code: `tables = page.query_selector_all("table")` +- Line 142: Conditional branch. Code: `if len(tables) >= 2:` +- Line 143: Return a value to the caller. Code: `return tables` +- Line 144: Execute the statement as written. Code: `time.sleep(1)` +- Line 145: Return a value to the caller. Code: `return []` +- Line 146: Blank line for readability. Code: `` +- Line 147: Blank line for readability. Code: `` +- Line 148: Define the scrape_yahoo_options function. Code: `def scrape_yahoo_options(symbol, expiration=None):` +- Line 149: URL-encode the stock symbol. Code: `encoded = urllib.parse.quote(symbol, safe="")` +- Line 150: Build the base Yahoo Finance options URL. Code: `base_url = f"https://finance.yahoo.com/quote/{encoded}/options/"` +- Line 151: Normalize the expiration input string. Code: `requested_expiration = expiration.strip() if expiration else None` +- Line 152: Conditional branch. Code: `if not requested_expiration:` +- Line 153: Normalize the expiration input string. Code: `requested_expiration = None` +- Line 154: Set the URL to load. Code: `url = base_url` +- Line 155: Blank line for readability. Code: `` +- Line 156: Emit or configure a log message. Code: `app.logger.info(` +- Line 157: Execute the statement as written. Code: `"Starting scrape for symbol=%s expiration=%s url=%s",` +- Line 158: Execute the statement as written. Code: `symbol,` +- Line 159: Execute the statement as written. Code: `requested_expiration,` +- Line 160: Execute the statement as written. Code: `base_url,` +- Line 161: Close the current block or container. Code: `)` +- Line 162: Blank line for readability. Code: `` +- Line 163: Reserve storage for options table HTML. Code: `calls_html = None` +- Line 164: Reserve storage for options table HTML. Code: `puts_html = None` +- Line 165: Initialize or assign the current price. Code: `price = None` +- Line 166: Track the resolved expiration metadata. 
Code: `selected_expiration_value = None` +- Line 167: Track the resolved expiration metadata. Code: `selected_expiration_label = None` +- Line 168: Prepare or update the list of available expirations. Code: `expiration_options = []` +- Line 169: Track the resolved expiration epoch timestamp. Code: `target_date = None` +- Line 170: Track whether a base-page lookup is needed. Code: `fallback_to_base = False` +- Line 171: Blank line for readability. Code: `` +- Line 172: Enter a context manager block. Code: `with sync_playwright() as p:` +- Line 173: Launch a Playwright browser instance. Code: `browser = p.chromium.launch(headless=True)` +- Line 174: Create a new Playwright page. Code: `page = browser.new_page()` +- Line 175: Interact with the Playwright page. Code: `page.set_extra_http_headers(` +- Line 176: Execute the statement as written. Code: `{` +- Line 177: Execute the statement as written. Code: `"User-Agent": (` +- Line 178: Execute the statement as written. Code: `"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "` +- Line 179: Execute the statement as written. Code: `"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"` +- Line 180: Close the current block or container. Code: `)` +- Line 181: Close the current block or container. Code: `}` +- Line 182: Close the current block or container. Code: `)` +- Line 183: Interact with the Playwright page. Code: `page.set_default_timeout(60000)` +- Line 184: Blank line for readability. Code: `` +- Line 185: Start a try block for error handling. Code: `try:` +- Line 186: Conditional branch. Code: `if requested_expiration:` +- Line 187: Conditional branch. Code: `if requested_expiration.isdigit():` +- Line 188: Track the resolved expiration epoch timestamp. Code: `target_date = int(requested_expiration)` +- Line 189: Track the resolved expiration metadata. Code: `selected_expiration_value = target_date` +- Line 190: Track the resolved expiration metadata. 
Code: `selected_expiration_label = format_expiration_label(target_date)` +- Line 191: Fallback branch. Code: `else:` +- Line 192: Execute the statement as written. Code: `parsed_date = parse_date(requested_expiration)` +- Line 193: Conditional branch. Code: `if parsed_date:` +- Line 194: Track the resolved expiration epoch timestamp. Code: `target_date = int(` +- Line 195: Execute the statement as written. Code: `datetime(` +- Line 196: Execute the statement as written. Code: `parsed_date.year,` +- Line 197: Execute the statement as written. Code: `parsed_date.month,` +- Line 198: Execute the statement as written. Code: `parsed_date.day,` +- Line 199: Execute the statement as written. Code: `tzinfo=timezone.utc,` +- Line 200: Execute the statement as written. Code: `).timestamp()` +- Line 201: Close the current block or container. Code: `)` +- Line 202: Track the resolved expiration metadata. Code: `selected_expiration_value = target_date` +- Line 203: Track the resolved expiration metadata. Code: `selected_expiration_label = format_expiration_label(target_date)` +- Line 204: Fallback branch. Code: `else:` +- Line 205: Track whether a base-page lookup is needed. Code: `fallback_to_base = True` +- Line 206: Blank line for readability. Code: `` +- Line 207: Conditional branch. Code: `if target_date:` +- Line 208: Set the URL to load. Code: `url = f"{base_url}?date={target_date}"` +- Line 209: Blank line for readability. Code: `` +- Line 210: Navigate the Playwright page to the target URL. Code: `page.goto(url, wait_until="domcontentloaded", timeout=60000)` +- Line 211: Emit or configure a log message. Code: `app.logger.info("Page loaded (domcontentloaded) for %s", symbol)` +- Line 212: Blank line for readability. Code: `` +- Line 213: Capture the page HTML content. Code: `html = page.content()` +- Line 214: Extract expiration date timestamps from the HTML. 
Code: `expiration_dates = extract_expiration_dates_from_html(html)` +- Line 215: Prepare or update the list of available expirations. Code: `expiration_options = build_expiration_options(expiration_dates)` +- Line 216: Blank line for readability. Code: `` +- Line 217: Conditional branch. Code: `if fallback_to_base:` +- Line 218: Execute the statement as written. Code: `resolved_value, resolved_label = resolve_expiration(` +- Line 219: Execute the statement as written. Code: `requested_expiration, expiration_options` +- Line 220: Close the current block or container. Code: `)` +- Line 221: Conditional branch. Code: `if resolved_value is None:` +- Line 222: Return a value to the caller. Code: `return {` +- Line 223: Execute the statement as written. Code: `"error": "Requested expiration not available",` +- Line 224: Execute the statement as written. Code: `"stock": symbol,` +- Line 225: Execute the statement as written. Code: `"requested_expiration": requested_expiration,` +- Line 226: Execute the statement as written. Code: `"available_expirations": [` +- Line 227: Execute the statement as written. Code: `{"label": opt.get("label"), "value": opt.get("value")}` +- Line 228: Loop over items. Code: `for opt in expiration_options` +- Line 229: Close the current block or container. Code: `],` +- Line 230: Close the current block or container. Code: `}` +- Line 231: Blank line for readability. Code: `` +- Line 232: Track the resolved expiration epoch timestamp. Code: `target_date = resolved_value` +- Line 233: Track the resolved expiration metadata. Code: `selected_expiration_value = resolved_value` +- Line 234: Track the resolved expiration metadata. Code: `selected_expiration_label = resolved_label or format_expiration_label(` +- Line 235: Execute the statement as written. Code: `resolved_value` +- Line 236: Close the current block or container. Code: `)` +- Line 237: Set the URL to load. 
Code: `url = f"{base_url}?date={resolved_value}"` +- Line 238: Navigate the Playwright page to the target URL. Code: `page.goto(url, wait_until="domcontentloaded", timeout=60000)` +- Line 239: Emit or configure a log message. Code: `app.logger.info("Page loaded (domcontentloaded) for %s", symbol)` +- Line 240: Blank line for readability. Code: `` +- Line 241: Capture the page HTML content. Code: `html = page.content()` +- Line 242: Extract expiration date timestamps from the HTML. Code: `expiration_dates = extract_expiration_dates_from_html(html)` +- Line 243: Prepare or update the list of available expirations. Code: `expiration_options = build_expiration_options(expiration_dates)` +- Line 244: Blank line for readability. Code: `` +- Line 245: Conditional branch. Code: `if target_date and expiration_options:` +- Line 246: Execute the statement as written. Code: `matched = None` +- Line 247: Loop over items. Code: `for opt in expiration_options:` +- Line 248: Conditional branch. Code: `if opt.get("value") == target_date:` +- Line 249: Execute the statement as written. Code: `matched = opt` +- Line 250: Execute the statement as written. Code: `break` +- Line 251: Conditional branch. Code: `if not matched:` +- Line 252: Return a value to the caller. Code: `return {` +- Line 253: Execute the statement as written. Code: `"error": "Requested expiration not available",` +- Line 254: Execute the statement as written. Code: `"stock": symbol,` +- Line 255: Execute the statement as written. Code: `"requested_expiration": requested_expiration,` +- Line 256: Execute the statement as written. Code: `"available_expirations": [` +- Line 257: Execute the statement as written. Code: `{"label": opt.get("label"), "value": opt.get("value")}` +- Line 258: Loop over items. Code: `for opt in expiration_options` +- Line 259: Close the current block or container. Code: `],` +- Line 260: Close the current block or container. Code: `}` +- Line 261: Track the resolved expiration metadata. 
Code: `selected_expiration_label = matched.get("label")` +- Line 262: Alternative conditional branch. Code: `elif expiration_options and not target_date:` +- Line 263: Track the resolved expiration metadata. Code: `selected_expiration_value = expiration_options[0].get("value")` +- Line 264: Track the resolved expiration metadata. Code: `selected_expiration_label = expiration_options[0].get("label")` +- Line 265: Blank line for readability. Code: `` +- Line 266: Emit or configure a log message. Code: `app.logger.info("Waiting for options tables...")` +- Line 267: Blank line for readability. Code: `` +- Line 268: Collect option tables from the page. Code: `tables = wait_for_tables(page)` +- Line 269: Conditional branch. Code: `if len(tables) < 2:` +- Line 270: Emit or configure a log message. Code: `app.logger.error(` +- Line 271: Execute the statement as written. Code: `"Only %d tables found; expected 2. HTML may have changed.",` +- Line 272: Execute the statement as written. Code: `len(tables),` +- Line 273: Close the current block or container. Code: `)` +- Line 274: Return an error payload because fewer than two tables were found. Code: `return {"error": "Could not locate options tables", "stock": symbol}` +- Line 275: Blank line for readability. Code: `` +- Line 276: Emit or configure a log message. Code: `app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables))` +- Line 277: Blank line for readability. Code: `` +- Line 278: Capture the outer HTML of the first table (calls). Code: `calls_html = tables[0].evaluate("el => el.outerHTML")` +- Line 279: Capture the outer HTML of the second table (puts). Code: `puts_html = tables[1].evaluate("el => el.outerHTML")` +- Line 280: Blank line for readability. Code: `` +- Line 281: Comment describing the next block. Code: `# --- Extract current price ---` +- Line 282: Start a try block for error handling. Code: `try:` +- Line 283: Comment describing the next block. Code: `# Primary selector` +- Line 284: Read the current price text from the page. 
Code: `price_text = page.locator(` +- Line 285: Execute the statement as written. Code: `"fin-streamer[data-field='regularMarketPrice']"` +- Line 286: Execute the statement as written. Code: `).inner_text()` +- Line 287: Initialize or assign the current price. Code: `price = float(price_text.replace(",", ""))` +- Line 288: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 289: Start a try block for error handling. Code: `try:` +- Line 290: Comment describing the next block. Code: `# Fallback` +- Line 291: Read the current price text from the page. Code: `price_text = page.locator("span[data-testid='qsp-price']").inner_text()` +- Line 292: Initialize or assign the current price. Code: `price = float(price_text.replace(",", ""))` +- Line 293: Handle exceptions for the preceding try block. Code: `except Exception as e:` +- Line 294: Emit or configure a log message. Code: `app.logger.warning("Failed to extract price for %s: %s", symbol, e)` +- Line 295: Blank line for readability. Code: `` +- Line 296: Emit or configure a log message. Code: `app.logger.info("Current price for %s = %s", symbol, price)` +- Line 297: Begin the cleanup block that runs whether or not the try block succeeded. Code: `finally:` +- Line 298: Close the Playwright browser. Code: `browser.close()` +- Line 299: Blank line for readability. Code: `` +- Line 300: Comment describing the next block. Code: `# ----------------------------------------------------------------------` +- Line 301: Comment describing the next block. Code: `# Parsing Table HTML` +- Line 302: Comment describing the next block. Code: `# ----------------------------------------------------------------------` +- Line 303: Define the parse_table function. Code: `def parse_table(table_html, side):` +- Line 304: Conditional branch. Code: `if not table_html:` +- Line 305: Emit or configure a log message. Code: `app.logger.warning("No %s table HTML for %s", side, symbol)` +- Line 306: Return a value to the caller. 
Code: `return []` +- Line 307: Blank line for readability. Code: `` +- Line 308: Execute the statement as written. Code: `soup = BeautifulSoup(table_html, "html.parser")` +- Line 309: Blank line for readability. Code: `` +- Line 310: Extract header labels from the table. Code: `headers = [th.get_text(strip=True) for th in soup.select("thead th")]` +- Line 311: Collect table rows for parsing. Code: `rows = soup.select("tbody tr")` +- Line 312: Blank line for readability. Code: `` +- Line 313: Initialize the parsed rows list. Code: `parsed = []` +- Line 314: Loop over items. Code: `for r in rows:` +- Line 315: Collect table cells for the current row. Code: `tds = r.find_all("td")` +- Line 316: Conditional branch. Code: `if len(tds) != len(headers):` +- Line 317: Execute the statement as written. Code: `continue` +- Line 318: Blank line for readability. Code: `` +- Line 319: Initialize a row dictionary. Code: `item = {}` +- Line 320: Loop over items. Code: `for i, c in enumerate(tds):` +- Line 321: Read the header name for the current column. Code: `key = headers[i]` +- Line 322: Read or convert the cell value. Code: `val = c.get_text(" ", strip=True)` +- Line 323: Blank line for readability. Code: `` +- Line 324: Comment describing the next block. Code: `# Convert numeric fields` +- Line 325: Conditional branch. Code: `if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]:` +- Line 326: Start a try block for error handling. Code: `try:` +- Line 327: Read or convert the cell value. Code: `val = float(val.replace(",", ""))` +- Line 328: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 329: Read or convert the cell value. Code: `val = None` +- Line 330: Alternative conditional branch. Code: `elif key in ["Volume", "Open Interest"]:` +- Line 331: Start a try block for error handling. Code: `try:` +- Line 332: Read or convert the cell value. 
Code: `val = int(val.replace(",", ""))` +- Line 333: Handle exceptions for the preceding try block. Code: `except Exception:` +- Line 334: Read or convert the cell value. Code: `val = None` +- Line 335: Alternative conditional branch. Code: `elif val in ["-", ""]:` +- Line 336: Read or convert the cell value. Code: `val = None` +- Line 337: Blank line for readability. Code: `` +- Line 338: Execute the statement as written. Code: `item[key] = val` +- Line 339: Blank line for readability. Code: `` +- Line 340: Execute the statement as written. Code: `parsed.append(item)` +- Line 341: Blank line for readability. Code: `` +- Line 342: Emit or configure a log message. Code: `app.logger.info("Parsed %d %s rows", len(parsed), side)` +- Line 343: Return a value to the caller. Code: `return parsed` +- Line 344: Blank line for readability. Code: `` +- Line 345: Parse the full calls and puts tables. Code: `calls_full = parse_table(calls_html, "calls")` +- Line 346: Parse the full calls and puts tables. Code: `puts_full = parse_table(puts_html, "puts")` +- Line 347: Blank line for readability. Code: `` +- Line 348: Comment describing the next block. Code: `# ----------------------------------------------------------------------` +- Line 349: Comment describing the next block. Code: `# Pruning logic` +- Line 350: Comment describing the next block. Code: `# ----------------------------------------------------------------------` +- Line 351: Define the prune_nearest function. Code: `def prune_nearest(options, price_value, limit=26, side=""):` +- Line 352: Conditional branch. Code: `if price_value is None:` +- Line 353: Return a value to the caller. Code: `return options, 0` +- Line 354: Blank line for readability. Code: `` +- Line 355: Filter options to numeric strike entries. Code: `numeric = [o for o in options if isinstance(o.get("Strike"), (int, float))]` +- Line 356: Blank line for readability. Code: `` +- Line 357: Conditional branch. 
Code: `if len(numeric) <= limit:` +- Line 358: Return a value to the caller. Code: `return numeric, 0` +- Line 359: Blank line for readability. Code: `` +- Line 360: Sort options by distance to current price. Code: `sorted_opts = sorted(numeric, key=lambda x: abs(x["Strike"] - price_value))` +- Line 361: Keep the closest strike entries. Code: `pruned = sorted_opts[:limit]` +- Line 362: Compute how many rows were pruned. Code: `pruned_count = len(options) - len(pruned)` +- Line 363: Return a value to the caller. Code: `return pruned, pruned_count` +- Line 364: Blank line for readability. Code: `` +- Line 365: Apply pruning to calls. Code: `calls, pruned_calls = prune_nearest(calls_full, price, side="calls")` +- Line 366: Apply pruning to puts. Code: `puts, pruned_puts = prune_nearest(puts_full, price, side="puts")` +- Line 367: Blank line for readability. Code: `` +- Line 368: Define the strike_range function. Code: `def strike_range(opts):` +- Line 369: Collect strike prices from the option list. Code: `strikes = [o["Strike"] for o in opts if isinstance(o.get("Strike"), (int, float))]` +- Line 370: Return a value to the caller. Code: `return [min(strikes), max(strikes)] if strikes else [None, None]` +- Line 371: Blank line for readability. Code: `` +- Line 372: Return a value to the caller. Code: `return {` +- Line 373: Execute the statement as written. Code: `"stock": symbol,` +- Line 374: Execute the statement as written. Code: `"url": url,` +- Line 375: Execute the statement as written. Code: `"requested_expiration": requested_expiration,` +- Line 376: Execute the statement as written. Code: `"selected_expiration": {` +- Line 377: Execute the statement as written. Code: `"value": selected_expiration_value,` +- Line 378: Execute the statement as written. Code: `"label": selected_expiration_label,` +- Line 379: Close the current block or container. Code: `},` +- Line 380: Execute the statement as written. 
Code: `"current_price": price,` +- Line 381: Execute the statement as written. Code: `"calls": calls,` +- Line 382: Execute the statement as written. Code: `"puts": puts,` +- Line 383: Execute the statement as written. Code: `"calls_strike_range": strike_range(calls),` +- Line 384: Execute the statement as written. Code: `"puts_strike_range": strike_range(puts),` +- Line 385: Execute the statement as written. Code: `"total_calls": len(calls),` +- Line 386: Execute the statement as written. Code: `"total_puts": len(puts),` +- Line 387: Execute the statement as written. Code: `"pruned_calls_count": pruned_calls,` +- Line 388: Execute the statement as written. Code: `"pruned_puts_count": pruned_puts,` +- Line 389: Close the current block or container. Code: `}` +- Line 390: Blank line for readability. Code: `` +- Line 391: Blank line for readability. Code: `` +- Line 392: Attach the route decorator to the handler. Code: `@app.route("/scrape_sync")` +- Line 393: Define the scrape_sync function. Code: `def scrape_sync():` +- Line 394: Read the stock symbol parameter. Code: `symbol = request.args.get("stock", "MSFT")` +- Line 395: Read the expiration parameters from the request. Code: `expiration = (` +- Line 396: Execute the statement as written. Code: `request.args.get("expiration")` +- Line 397: Execute the statement as written. Code: `or request.args.get("expiry")` +- Line 398: Execute the statement as written. Code: `or request.args.get("date")` +- Line 399: Close the current block or container. Code: `)` +- Line 400: Emit or configure a log message. Code: `app.logger.info(` +- Line 401: Execute the statement as written. Code: `"Received /scrape_sync request for symbol=%s expiration=%s",` +- Line 402: Execute the statement as written. Code: `symbol,` +- Line 403: Execute the statement as written. Code: `expiration,` +- Line 404: Close the current block or container. Code: `)` +- Line 405: Return a value to the caller. 
Code: `return jsonify(scrape_yahoo_options(symbol, expiration))` +- Line 406: Blank line for readability. Code: `` +- Line 407: Blank line for readability. Code: `` +- Line 408: Conditional branch. Code: `if __name__ == "__main__":` +- Line 409: Run the Flask development server. Code: `app.run(host="0.0.0.0", port=9777)` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fbceef8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM mcr.microsoft.com/playwright/python:v1.50.0-jammy + +WORKDIR /app + +ENV PYTHONUNBUFFERED=1 + +COPY scraper_service.py /app/scraper_service.py + +RUN python -m pip install --no-cache-dir flask beautifulsoup4 + +EXPOSE 9777 + +CMD ["python", "scraper_service.py"] diff --git a/scraper_service.py b/scraper_service.py index 9b8fb61..3786cd0 100644 --- a/scraper_service.py +++ b/scraper_service.py @@ -1,8 +1,10 @@ from flask import Flask, jsonify, request from playwright.sync_api import sync_playwright from bs4 import BeautifulSoup +from datetime import datetime, timezone import urllib.parse import logging +import re import time app = Flask(__name__) @@ -14,68 +16,286 @@ logging.basicConfig( ) app.logger.setLevel(logging.INFO) +DATE_FORMATS = ( + "%Y-%m-%d", + "%Y/%m/%d", + "%Y%m%d", + "%b %d, %Y", + "%B %d, %Y", +) -def scrape_yahoo_options(symbol): + +def parse_date(value): + for fmt in DATE_FORMATS: + try: + return datetime.strptime(value, fmt).date() + except ValueError: + continue + return None + + +def normalize_label(value): + return " ".join(value.strip().split()).lower() + + +def format_expiration_label(timestamp): + try: + return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d") + except Exception: + return str(timestamp) + + +def extract_expiration_dates_from_html(html): + if not html: + return [] + + patterns = ( + r'\\"expirationDates\\":\[(.*?)\]', + r'"expirationDates":\[(.*?)\]', + ) + match = None + for pattern in patterns: + match = re.search(pattern, html, re.DOTALL) + if match: + break + if 
not match: + return [] + + raw = match.group(1) + values = [] + for part in raw.split(","): + part = part.strip() + if part.isdigit(): + try: + values.append(int(part)) + except Exception: + continue + return values + + +def build_expiration_options(expiration_dates): + options = [] + for value in expiration_dates or []: + try: + value_int = int(value) + except Exception: + continue + + label = format_expiration_label(value_int) + try: + date_value = datetime.utcfromtimestamp(value_int).date() + except Exception: + date_value = None + + options.append({"value": value_int, "label": label, "date": date_value}) + return sorted(options, key=lambda x: x["value"]) + + +def resolve_expiration(expiration, options): + if not expiration: + return None, None + + raw = expiration.strip() + if not raw: + return None, None + + if raw.isdigit(): + value = int(raw) + if options: + for opt in options: + if opt.get("value") == value: + return value, opt.get("label") + return None, None + return value, format_expiration_label(value) + + requested_date = parse_date(raw) + if requested_date: + for opt in options: + if opt.get("date") == requested_date: + return opt.get("value"), opt.get("label") + return None, None + + normalized = normalize_label(raw) + for opt in options: + if normalize_label(opt.get("label", "")) == normalized: + return opt.get("value"), opt.get("label") + + return None, None + + +def wait_for_tables(page): + try: + page.wait_for_selector( + "section[data-testid='options-list-table'] table", + timeout=30000, + ) + except Exception: + page.wait_for_selector("table", timeout=30000) + + for _ in range(30): # 30 * 1s = 30 seconds + tables = page.query_selector_all( + "section[data-testid='options-list-table'] table" + ) + if len(tables) >= 2: + return tables + tables = page.query_selector_all("table") + if len(tables) >= 2: + return tables + time.sleep(1) + return [] + + +def scrape_yahoo_options(symbol, expiration=None): encoded = urllib.parse.quote(symbol, safe="") - 
url = f"https://finance.yahoo.com/quote/{encoded}/options/" + base_url = f"https://finance.yahoo.com/quote/{encoded}/options/" + requested_expiration = expiration.strip() if expiration else None + if not requested_expiration: + requested_expiration = None + url = base_url - app.logger.info("Starting scrape for symbol=%s url=%s", symbol, url) + app.logger.info( + "Starting scrape for symbol=%s expiration=%s url=%s", + symbol, + requested_expiration, + base_url, + ) + + calls_html = None + puts_html = None + price = None + selected_expiration_value = None + selected_expiration_label = None + expiration_options = [] + target_date = None + fallback_to_base = False with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_page() - page.set_extra_http_headers({ - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36" - ) - }) + page.set_extra_http_headers( + { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36" + ) + } + ) + page.set_default_timeout(60000) - page.goto(url, wait_until="domcontentloaded", timeout=60000) - app.logger.info("Page loaded (domcontentloaded) for %s", symbol) - - # --- FIXED: Yahoo changed all classnames. We no longer depend on them. --- - # We simply wait until at least TWO <table> tags appear. - app.logger.info("Waiting for options tables...") - - # Wait for any table to exist - page.wait_for_selector("table", timeout=30000) - - # Repeatedly check until 2 tables appear - for _ in range(30): # 30 × 1s = 30 seconds - tables = page.query_selector_all("table") - if len(tables) >= 2: - break - time.sleep(1) - - tables = page.query_selector_all("table") - if len(tables) < 2: - app.logger.error("Only %d tables found — expected 2.
HTML changed?", len(tables)) - browser.close() - return {"error": "Could not locate options tables", "stock": symbol} - - app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables)) - - calls_html = tables[0].evaluate("el => el.outerHTML") - puts_html = tables[1].evaluate("el => el.outerHTML") - - # --- Extract current price --- - price = None try: - # Primary selector - price_text = page.locator("fin-streamer[data-field='regularMarketPrice']").inner_text() - price = float(price_text.replace(",", "")) - except: + if requested_expiration: + if requested_expiration.isdigit(): + target_date = int(requested_expiration) + selected_expiration_value = target_date + selected_expiration_label = format_expiration_label(target_date) + else: + parsed_date = parse_date(requested_expiration) + if parsed_date: + target_date = int( + datetime( + parsed_date.year, + parsed_date.month, + parsed_date.day, + tzinfo=timezone.utc, + ).timestamp() + ) + selected_expiration_value = target_date + selected_expiration_label = format_expiration_label(target_date) + else: + fallback_to_base = True + + if target_date: + url = f"{base_url}?date={target_date}" + + page.goto(url, wait_until="domcontentloaded", timeout=60000) + app.logger.info("Page loaded (domcontentloaded) for %s", symbol) + + html = page.content() + expiration_dates = extract_expiration_dates_from_html(html) + expiration_options = build_expiration_options(expiration_dates) + + if fallback_to_base: + resolved_value, resolved_label = resolve_expiration( + requested_expiration, expiration_options + ) + if resolved_value is None: + return { + "error": "Requested expiration not available", + "stock": symbol, + "requested_expiration": requested_expiration, + "available_expirations": [ + {"label": opt.get("label"), "value": opt.get("value")} + for opt in expiration_options + ], + } + + target_date = resolved_value + selected_expiration_value = resolved_value + selected_expiration_label = resolved_label or 
format_expiration_label( + resolved_value + ) + url = f"{base_url}?date={resolved_value}" + page.goto(url, wait_until="domcontentloaded", timeout=60000) + app.logger.info("Page loaded (domcontentloaded) for %s", symbol) + + html = page.content() + expiration_dates = extract_expiration_dates_from_html(html) + expiration_options = build_expiration_options(expiration_dates) + + if target_date and expiration_options: + matched = None + for opt in expiration_options: + if opt.get("value") == target_date: + matched = opt + break + if not matched: + return { + "error": "Requested expiration not available", + "stock": symbol, + "requested_expiration": requested_expiration, + "available_expirations": [ + {"label": opt.get("label"), "value": opt.get("value")} + for opt in expiration_options + ], + } + selected_expiration_label = matched.get("label") + elif expiration_options and not target_date: + selected_expiration_value = expiration_options[0].get("value") + selected_expiration_label = expiration_options[0].get("label") + + app.logger.info("Waiting for options tables...") + + tables = wait_for_tables(page) + if len(tables) < 2: + app.logger.error( + "Only %d tables found; expected 2. HTML may have changed.", + len(tables), + ) + return {"error": "Could not locate options tables", "stock": symbol} + + app.logger.info("Found %d tables. 
Extracting Calls & Puts.", len(tables)) + + calls_html = tables[0].evaluate("el => el.outerHTML") + puts_html = tables[1].evaluate("el => el.outerHTML") + + # --- Extract current price --- try: - # Fallback - price_text = page.locator("span[data-testid='qsp-price']").inner_text() + # Primary selector + price_text = page.locator( + "fin-streamer[data-field='regularMarketPrice']" + ).inner_text() price = float(price_text.replace(",", "")) - except Exception as e: - app.logger.warning("Failed to extract price for %s: %s", symbol, e) + except Exception: + try: + # Fallback + price_text = page.locator("span[data-testid='qsp-price']").inner_text() + price = float(price_text.replace(",", "")) + except Exception as e: + app.logger.warning("Failed to extract price for %s: %s", symbol, e) - app.logger.info("Current price for %s = %s", symbol, price) - - browser.close() + app.logger.info("Current price for %s = %s", symbol, price) + finally: + browser.close() # ---------------------------------------------------------------------- # Parsing Table HTML @@ -105,12 +325,12 @@ def scrape_yahoo_options(symbol): if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]: try: val = float(val.replace(",", "")) - except: + except Exception: val = None elif key in ["Volume", "Open Interest"]: try: val = int(val.replace(",", "")) - except: + except Exception: val = None elif val in ["-", ""]: val = None @@ -152,6 +372,11 @@ def scrape_yahoo_options(symbol): return { "stock": symbol, "url": url, + "requested_expiration": requested_expiration, + "selected_expiration": { + "value": selected_expiration_value, + "label": selected_expiration_label, + }, "current_price": price, "calls": calls, "puts": puts, @@ -167,8 +392,17 @@ def scrape_yahoo_options(symbol): @app.route("/scrape_sync") def scrape_sync(): symbol = request.args.get("stock", "MSFT") - app.logger.info("Received /scrape_sync request for symbol=%s", symbol) - return jsonify(scrape_yahoo_options(symbol)) + expiration = ( + 
request.args.get("expiration") + or request.args.get("expiry") + or request.args.get("date") + ) + app.logger.info( + "Received /scrape_sync request for symbol=%s expiration=%s", + symbol, + expiration, + ) + return jsonify(scrape_yahoo_options(symbol, expiration)) if __name__ == "__main__":