From 9a55021063fbb14ffaf8e99b4afaf49ae90494a6 Mon Sep 17 00:00:00 2001 From: Rushabh Gosar Date: Sat, 6 Dec 2025 23:55:29 -0800 Subject: [PATCH] Fix the table not loading, also fix runner to install playwright --- runner.bat | 46 ++++++++++--- scraper_service.py | 167 +++++++++++++++++++++------------------------ 2 files changed, 112 insertions(+), 101 deletions(-) diff --git a/runner.bat b/runner.bat index d43b5f6..4f6dc4f 100644 --- a/runner.bat +++ b/runner.bat @@ -1,24 +1,48 @@ @echo off -setlocal +setlocal ENABLEDELAYEDEXPANSION -:: Set the project folder to this script's directory +:: Set project directory to script's location set "PROJECT_DIR=%~dp0" cd /d "%PROJECT_DIR%" -:: Check if venv folder exists; if not, create venv and install requirements +echo ----------------------------------------- +echo Checking virtual environment... +echo ----------------------------------------- + +:: Create venv if missing if not exist "venv\Scripts\python.exe" ( echo Creating virtual environment... python -m venv venv - call venv\Scripts\activate.bat - echo Installing required packages... - pip install --upgrade pip - pip install flask selenium webdriver-manager beautifulsoup4 -) else ( - call venv\Scripts\activate.bat ) -:: Run the Flask server with logs redirected to server.log -echo Starting Flask server, logs will be written to server.log +:: Activate venv +call "venv\Scripts\activate.bat" + +echo ----------------------------------------- +echo Upgrading pip... +echo ----------------------------------------- +python -m pip install --upgrade pip + +echo ----------------------------------------- +echo Installing Python prerequisites... +echo ----------------------------------------- + +pip install flask selenium webdriver-manager beautifulsoup4 playwright + +echo ----------------------------------------- +echo Installing Playwright browser binaries... +echo ----------------------------------------- +playwright install || ( + echo ERROR: Playwright install failed! Trying chromium only... + playwright install chromium +) + +echo ----------------------------------------- +echo All dependencies installed successfully! +echo Launching server... +echo ----------------------------------------- + +:: Start Flask server with persistent window start "" cmd /k "venv\Scripts\python.exe scraper_service.py" endlocal diff --git a/scraper_service.py b/scraper_service.py index 5062540..9b8fb61 100644 --- a/scraper_service.py +++ b/scraper_service.py @@ -3,10 +3,11 @@ from playwright.sync_api import sync_playwright from bs4 import BeautifulSoup import urllib.parse import logging +import time app = Flask(__name__) -# Configure logging +# Logging logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" @@ -24,144 +25,129 @@ def scrape_yahoo_options(symbol): browser = p.chromium.launch(headless=True) page = browser.new_page() page.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118 Safari/537.36" + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36" + ) }) - # Avoid networkidle on Yahoo (it rarely goes “idle” because of ads/streaming) page.goto(url, wait_until="domcontentloaded", timeout=60000) app.logger.info("Page loaded (domcontentloaded) for %s", symbol) - # Wait for the options tables - page.wait_for_selector( - "section[data-testid='options-list-table'] table.yf-wurt5d", - timeout=30000 - ) - app.logger.info("Options tables located in DOM for %s", symbol) + # --- FIXED: Yahoo changed all classnames. We no longer depend on them. --- + # We simply wait until at least TWO tags appear. + app.logger.info("Waiting for options tables...") - # Grab CALLS and PUTS tables separately (first = Calls, second = Puts) - tables = page.evaluate(""" - () => { - const section = document.querySelector('section[data-testid="options-list-table"]'); - if (!section) return { calls: null, puts: null }; + # Wait for any table to exist + page.wait_for_selector("table", timeout=30000) - const tbs = section.querySelectorAll('table.yf-wurt5d'); - const getHTML = el => el ? el.outerHTML : null; + # Repeatedly check until 2 tables appear + for _ in range(30): # 30 × 1s = 30 seconds + tables = page.query_selector_all("table") + if len(tables) >= 2: + break + time.sleep(1) - return { - calls: getHTML(tbs[0] || null), - puts: getHTML(tbs[1] || null) - }; - } - """) + tables = page.query_selector_all("table") + if len(tables) < 2: + app.logger.error("Only %d tables found — expected 2. HTML changed?", len(tables)) + browser.close() + return {"error": "Could not locate options tables", "stock": symbol} - calls_html = tables.get("calls") if tables else None - puts_html = tables.get("puts") if tables else None + app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables)) - # Current price + calls_html = tables[0].evaluate("el => el.outerHTML") + puts_html = tables[1].evaluate("el => el.outerHTML") + + # --- Extract current price --- price = None try: - price_text = page.locator("span[data-testid='qsp-price']").inner_text() + # Primary selector + price_text = page.locator("fin-streamer[data-field='regularMarketPrice']").inner_text() price = float(price_text.replace(",", "")) - app.logger.info("Current price for %s = %s", symbol, price) - except Exception as e: - app.logger.warning("Failed to get current price for %s: %s", symbol, e) + except: + try: + # Fallback + price_text = page.locator("span[data-testid='qsp-price']").inner_text() + price = float(price_text.replace(",", "")) + except Exception as e: + app.logger.warning("Failed to extract price for %s: %s", symbol, e) + + app.logger.info("Current price for %s = %s", symbol, price) browser.close() - if not calls_html and not puts_html: - app.logger.error("Could not locate options tables for %s", symbol) - return {"error": "Could not locate options tables", "stock": symbol} - + # ---------------------------------------------------------------------- + # Parsing Table HTML + # ---------------------------------------------------------------------- def parse_table(table_html, side): if not table_html: - app.logger.warning("No %s table HTML present for %s", side, symbol) + app.logger.warning("No %s table HTML for %s", side, symbol) return [] soup = BeautifulSoup(table_html, "html.parser") + headers = [th.get_text(strip=True) for th in soup.select("thead th")] rows = soup.select("tbody tr") - parsed_rows = [] + parsed = [] for r in rows: - cols = r.find_all("td") - if len(cols) != len(headers): + tds = r.find_all("td") + if len(tds) != len(headers): continue - data = {} - for i, c in enumerate(cols): + item = {} + for i, c in enumerate(tds): key = headers[i] val = c.get_text(" ", strip=True) + # Convert numeric fields if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]: try: val = float(val.replace(",", "")) - except Exception: + except: val = None elif key in ["Volume", "Open Interest"]: try: val = int(val.replace(",", "")) - except Exception: + except: val = None elif val in ["-", ""]: val = None - data[key] = val + item[key] = val - parsed_rows.append(data) + parsed.append(item) - app.logger.info("Parsed %d %s rows for %s", len(parsed_rows), side, symbol) - return parsed_rows + app.logger.info("Parsed %d %s rows", len(parsed), side) + return parsed calls_full = parse_table(calls_html, "calls") puts_full = parse_table(puts_html, "puts") - def rng(opts): - strikes = [r.get("Strike") for r in opts - if isinstance(r.get("Strike"), (int, float))] - return [min(strikes), max(strikes)] if strikes else [None, None] - + # ---------------------------------------------------------------------- + # Pruning logic + # ---------------------------------------------------------------------- def prune_nearest(options, price_value, limit=26, side=""): if price_value is None: - app.logger.info( - "No current price for %s; skipping pruning for %s (keeping %d rows)", - symbol, side, len(options) - ) return options, 0 - numeric_opts = [o for o in options if isinstance(o.get("Strike"), (int, float))] - if len(numeric_opts) <= limit: - app.logger.info( - "Not enough %s rows for pruning for %s: total=%d, limit=%d", - side, symbol, len(numeric_opts), limit - ) - return numeric_opts, 0 + numeric = [o for o in options if isinstance(o.get("Strike"), (int, float))] - sorted_opts = sorted( - numeric_opts, - key=lambda o: abs(o["Strike"] - price_value) - ) - pruned_list = sorted_opts[:limit] - pruned_count = len(options) - len(pruned_list) + if len(numeric) <= limit: + return numeric, 0 - app.logger.info( - "Pruned %s for %s: original=%d, kept=%d, pruned=%d (limit=%d)", - side, symbol, len(options), len(pruned_list), pruned_count, limit - ) - return pruned_list, pruned_count + sorted_opts = sorted(numeric, key=lambda x: abs(x["Strike"] - price_value)) + pruned = sorted_opts[:limit] + pruned_count = len(options) - len(pruned) + return pruned, pruned_count - # ✅ 26 closest by strike on each side - calls, pruned_calls_count = prune_nearest(calls_full, price, limit=26, side="calls") - puts, pruned_puts_count = prune_nearest(puts_full, price, limit=26, side="puts") + calls, pruned_calls = prune_nearest(calls_full, price, side="calls") + puts, pruned_puts = prune_nearest(puts_full, price, side="puts") - calls_range = rng(calls) - puts_range = rng(puts) - - app.logger.info( - "Final summary for %s: calls_kept=%d, puts_kept=%d, " - "calls_strike_range=%s, puts_strike_range=%s", - symbol, len(calls), len(puts), calls_range, puts_range - ) + def strike_range(opts): + strikes = [o["Strike"] for o in opts if isinstance(o.get("Strike"), (int, float))] + return [min(strikes), max(strikes)] if strikes else [None, None] return { "stock": symbol, @@ -169,20 +155,21 @@ def scrape_yahoo_options(symbol): "current_price": price, "calls": calls, "puts": puts, - "calls_strike_range": calls_range, - "puts_strike_range": puts_range, + "calls_strike_range": strike_range(calls), + "puts_strike_range": strike_range(puts), "total_calls": len(calls), "total_puts": len(puts), - "pruned_calls_count": pruned_calls_count, - "pruned_puts_count": pruned_puts_count, + "pruned_calls_count": pruned_calls, + "pruned_puts_count": pruned_puts, } + @app.route("/scrape_sync") def scrape_sync(): symbol = request.args.get("stock", "MSFT") app.logger.info("Received /scrape_sync request for symbol=%s", symbol) - data = scrape_yahoo_options(symbol) - return jsonify(data) + return jsonify(scrape_yahoo_options(symbol)) + if __name__ == "__main__": app.run(host="0.0.0.0", port=9777)