# File: SimpleScraper/scraper_service.py
# (189 lines, 6.5 KiB, Python)
from flask import Flask, jsonify, request
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import urllib.parse
import logging
app = Flask(__name__)

# Configure process-wide logging: timestamped, level-tagged messages.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Ensure Flask's own app logger also emits INFO-level records.
app.logger.setLevel(logging.INFO)
def scrape_yahoo_options(symbol):
    """Scrape the Yahoo Finance options chain for *symbol*.

    Loads ``https://finance.yahoo.com/quote/<symbol>/options/`` in headless
    Chromium, extracts the calls/puts tables and the current quote price,
    parses each table with BeautifulSoup, and keeps the 26 strikes nearest
    the current price on each side.

    Args:
        symbol: Ticker symbol (URL-quoted before use).

    Returns:
        A dict with keys ``stock``, ``url``, ``current_price``, ``calls``,
        ``puts``, ``calls_strike_range``, ``puts_strike_range``,
        ``total_calls``, ``total_puts``, ``pruned_calls_count``,
        ``pruned_puts_count`` — or ``{"error": ..., "stock": ...}`` when
        no options tables could be located.

    Raises:
        playwright TimeoutError (propagated) if the page or the options
        tables never appear within the configured timeouts.
    """
    encoded = urllib.parse.quote(symbol, safe="")
    url = f"https://finance.yahoo.com/quote/{encoded}/options/"
    app.logger.info("Starting scrape for symbol=%s url=%s", symbol, url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Desktop UA — presumably avoids Yahoo serving a degraded page
            # to an unidentified headless client (TODO confirm necessity).
            page.set_extra_http_headers({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118 Safari/537.36"
            })
            # Avoid networkidle on Yahoo (it rarely goes “idle” because of ads/streaming)
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            app.logger.info("Page loaded (domcontentloaded) for %s", symbol)

            # Wait for the options tables to be attached to the DOM.
            page.wait_for_selector(
                "section[data-testid='options-list-table'] table.yf-wurt5d",
                timeout=30000
            )
            app.logger.info("Options tables located in DOM for %s", symbol)

            # Grab CALLS and PUTS tables separately (first = Calls, second = Puts)
            tables = page.evaluate("""
                () => {
                    const section = document.querySelector('section[data-testid="options-list-table"]');
                    if (!section) return { calls: null, puts: null };
                    const tbs = section.querySelectorAll('table.yf-wurt5d');
                    const getHTML = el => el ? el.outerHTML : null;
                    return {
                        calls: getHTML(tbs[0] || null),
                        puts: getHTML(tbs[1] || null)
                    };
                }
            """)
            calls_html = tables.get("calls") if tables else None
            puts_html = tables.get("puts") if tables else None

            # Current price — best effort; pruning is skipped when missing.
            price = None
            try:
                price_text = page.locator("span[data-testid='qsp-price']").inner_text()
                price = float(price_text.replace(",", ""))
                app.logger.info("Current price for %s = %s", symbol, price)
            except Exception as e:
                app.logger.warning("Failed to get current price for %s: %s", symbol, e)
        finally:
            # Fix: close the browser even when goto/wait_for_selector/evaluate
            # raises, so a timeout no longer leaks a headless Chromium process.
            browser.close()

    if not calls_html and not puts_html:
        app.logger.error("Could not locate options tables for %s", symbol)
        return {"error": "Could not locate options tables", "stock": symbol}

    def parse_table(table_html, side):
        # Convert one Yahoo options <table> into a list of row dicts keyed
        # by header text; price columns -> float, count columns -> int,
        # "-"/"" placeholders -> None.
        if not table_html:
            app.logger.warning("No %s table HTML present for %s", side, symbol)
            return []
        soup = BeautifulSoup(table_html, "html.parser")
        headers = [th.get_text(strip=True) for th in soup.select("thead th")]
        rows = soup.select("tbody tr")
        parsed_rows = []
        for r in rows:
            cols = r.find_all("td")
            if len(cols) != len(headers):
                # Skip rows that don't match the header shape (separators etc.).
                continue
            data = {}
            for i, c in enumerate(cols):
                key = headers[i]
                val = c.get_text(" ", strip=True)
                if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]:
                    try:
                        val = float(val.replace(",", ""))
                    except Exception:
                        val = None
                elif key in ["Volume", "Open Interest"]:
                    try:
                        val = int(val.replace(",", ""))
                    except Exception:
                        val = None
                elif val in ["-", ""]:
                    val = None
                data[key] = val
            parsed_rows.append(data)
        app.logger.info("Parsed %d %s rows for %s", len(parsed_rows), side, symbol)
        return parsed_rows

    calls_full = parse_table(calls_html, "calls")
    puts_full = parse_table(puts_html, "puts")

    def rng(opts):
        # [min, max] strike over rows with a numeric strike, else [None, None].
        strikes = [r.get("Strike") for r in opts
                   if isinstance(r.get("Strike"), (int, float))]
        return [min(strikes), max(strikes)] if strikes else [None, None]

    def prune_nearest(options, price_value, limit=26, side=""):
        # Keep the `limit` rows whose strike is closest to price_value.
        # Returns (kept_rows, pruned_count). When no price is available the
        # rows are returned unchanged (best-effort, nothing pruned).
        if price_value is None:
            app.logger.info(
                "No current price for %s; skipping pruning for %s (keeping %d rows)",
                symbol, side, len(options)
            )
            return options, 0
        numeric_opts = [o for o in options if isinstance(o.get("Strike"), (int, float))]
        if len(numeric_opts) <= limit:
            app.logger.info(
                "Not enough %s rows for pruning for %s: total=%d, limit=%d",
                side, symbol, len(numeric_opts), limit
            )
            # Fix: rows with non-numeric strikes are still dropped on this
            # path, so report them in pruned_count instead of claiming 0.
            return numeric_opts, len(options) - len(numeric_opts)
        sorted_opts = sorted(
            numeric_opts,
            key=lambda o: abs(o["Strike"] - price_value)
        )
        pruned_list = sorted_opts[:limit]
        pruned_count = len(options) - len(pruned_list)
        app.logger.info(
            "Pruned %s for %s: original=%d, kept=%d, pruned=%d (limit=%d)",
            side, symbol, len(options), len(pruned_list), pruned_count, limit
        )
        return pruned_list, pruned_count

    # ✅ 26 closest by strike on each side
    calls, pruned_calls_count = prune_nearest(calls_full, price, limit=26, side="calls")
    puts, pruned_puts_count = prune_nearest(puts_full, price, limit=26, side="puts")

    calls_range = rng(calls)
    puts_range = rng(puts)
    app.logger.info(
        "Final summary for %s: calls_kept=%d, puts_kept=%d, "
        "calls_strike_range=%s, puts_strike_range=%s",
        symbol, len(calls), len(puts), calls_range, puts_range
    )
    return {
        "stock": symbol,
        "url": url,
        "current_price": price,
        "calls": calls,
        "puts": puts,
        "calls_strike_range": calls_range,
        "puts_strike_range": puts_range,
        "total_calls": len(calls),
        "total_puts": len(puts),
        "pruned_calls_count": pruned_calls_count,
        "pruned_puts_count": pruned_puts_count,
    }
@app.route("/scrape_sync")
def scrape_sync():
    """Synchronous scrape endpoint: ``/scrape_sync?stock=SYMB`` (default MSFT).

    Returns the scraper's result dict as JSON; scraper failures (e.g.
    Playwright timeouts) are turned into a JSON error with HTTP 502
    instead of an unhandled traceback / bare 500.
    """
    symbol = request.args.get("stock", "MSFT")
    app.logger.info("Received /scrape_sync request for symbol=%s", symbol)
    try:
        data = scrape_yahoo_options(symbol)
    except Exception as e:
        # Boundary handler: log the full traceback, answer with JSON.
        app.logger.exception("Scrape failed for symbol=%s", symbol)
        return jsonify({"error": str(e), "stock": symbol}), 502
    return jsonify(data)
# Entry point: run the Flask development server, listening on all interfaces.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=9777)