Fix expiration-specific options parsing

This commit is contained in:
2025-12-28 00:15:29 -08:00
parent 690887a6ec
commit 67b8fad423
2 changed files with 805 additions and 413 deletions


@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from datetime import datetime, timezone
import urllib.parse
import logging
import json
import re
import time
@@ -45,6 +46,159 @@ def format_expiration_label(timestamp):
    return str(timestamp)

def format_percent(value):
    if value is None:
        return None
    try:
        return f"{value * 100:.2f}%"
    except Exception:
        return None

def extract_raw_value(value):
    if isinstance(value, dict):
        return value.get("raw")
    return value

def extract_fmt_value(value):
    if isinstance(value, dict):
        return value.get("fmt")
    return None

def format_percent_value(value):
    fmt = extract_fmt_value(value)
    if fmt is not None:
        return fmt
    return format_percent(extract_raw_value(value))

def format_last_trade_date(timestamp):
    timestamp = extract_raw_value(timestamp)
    if not timestamp:
        return None
    try:
        return datetime.fromtimestamp(timestamp).strftime("%m/%d/%Y %I:%M %p") + " EST"
    except Exception:
        return None

def extract_option_chain_from_html(html):
    if not html:
        return None
    token = "\"body\":\""
    start = 0
    while True:
        idx = html.find(token, start)
        if idx == -1:
            break
        i = idx + len(token)
        escaped = False
        raw_chars = []
        while i < len(html):
            ch = html[i]
            if escaped:
                raw_chars.append(ch)
                escaped = False
            else:
                if ch == "\\":
                    raw_chars.append(ch)
                    escaped = True
                elif ch == "\"":
                    break
                else:
                    raw_chars.append(ch)
            i += 1
        raw = "".join(raw_chars)
        try:
            body_text = json.loads(f"\"{raw}\"")
        except json.JSONDecodeError:
            start = idx + len(token)
            continue
        if "optionChain" not in body_text:
            start = idx + len(token)
            continue
        try:
            payload = json.loads(body_text)
        except json.JSONDecodeError:
            start = idx + len(token)
            continue
        option_chain = payload.get("optionChain")
        if option_chain and option_chain.get("result"):
            return option_chain
        start = idx + len(token)
    return None

def extract_expiration_dates_from_chain(chain):
    if not chain:
        return []
    result = chain.get("result", [])
    if not result:
        return []
    return result[0].get("expirationDates", []) or []

def normalize_chain_rows(rows):
    normalized = []
    for row in rows or []:
        normalized.append(
            {
                "Contract Name": row.get("contractSymbol"),
                "Last Trade Date (EST)": format_last_trade_date(
                    row.get("lastTradeDate")
                ),
                "Strike": extract_raw_value(row.get("strike")),
                "Last Price": extract_raw_value(row.get("lastPrice")),
                "Bid": extract_raw_value(row.get("bid")),
                "Ask": extract_raw_value(row.get("ask")),
                "Change": extract_raw_value(row.get("change")),
                "% Change": format_percent_value(row.get("percentChange")),
                "Volume": extract_raw_value(row.get("volume")),
                "Open Interest": extract_raw_value(row.get("openInterest")),
                "Implied Volatility": format_percent_value(
                    row.get("impliedVolatility")
                ),
            }
        )
    return normalized

def build_rows_from_chain(chain):
    result = chain.get("result", []) if chain else []
    if not result:
        return [], []
    options = result[0].get("options", [])
    if not options:
        return [], []
    option = options[0]
    return (
        normalize_chain_rows(option.get("calls")),
        normalize_chain_rows(option.get("puts")),
    )

def extract_contract_expiry_code(contract_name):
    if not contract_name:
        return None
    match = re.search(r"(\d{6})", contract_name)
    return match.group(1) if match else None

def expected_expiry_code(timestamp):
    if not timestamp:
        return None
    try:
        return datetime.utcfromtimestamp(timestamp).strftime("%y%m%d")
    except Exception:
        return None

def extract_expiration_dates_from_html(html):
    if not html:
        return []
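For reference, the "body" payload that extract_option_chain_from_html scans for is itself an escaped JSON string embedded in the page, so it has to be decoded twice: once to unescape the string, once to parse the JSON inside it. A minimal sketch with a synthetic payload (not a real Yahoo response); the escape-aware regex below stands in for the character-by-character scan above:

import json
import re

# Synthetic page fragment: the API body is embedded as an escaped JSON string
# under a "body" key, the same shape the scanner above looks for.
payload = {"optionChain": {"result": [{"expirationDates": [1767398400]}]}}
html = '<script>{"body":' + json.dumps(json.dumps(payload)) + '}</script>'

raw = re.search(r'"body":"((?:[^"\\]|\\.)*)"', html).group(1)
body_text = json.loads('"' + raw + '"')       # first pass: unescape the embedded string
chain = json.loads(body_text)["optionChain"]  # second pass: parse the JSON it contains
print(chain["result"][0]["expirationDates"])  # [1767398400]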
@@ -146,6 +300,66 @@ def wait_for_tables(page):
def scrape_yahoo_options(symbol, expiration=None):
    def parse_table(table_html, side):
        if not table_html:
            app.logger.warning("No %s table HTML for %s", side, symbol)
            return []
        soup = BeautifulSoup(table_html, "html.parser")
        headers = [th.get_text(strip=True) for th in soup.select("thead th")]
        rows = soup.select("tbody tr")
        parsed = []
        for r in rows:
            tds = r.find_all("td")
            if len(tds) != len(headers):
                continue
            item = {}
            for i, c in enumerate(tds):
                key = headers[i]
                val = c.get_text(" ", strip=True)
                # Convert numeric fields
                if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]:
                    try:
                        val = float(val.replace(",", ""))
                    except Exception:
                        val = None
                elif key in ["Volume", "Open Interest"]:
                    try:
                        val = int(val.replace(",", ""))
                    except Exception:
                        val = None
                elif val in ["-", ""]:
                    val = None
                item[key] = val
            parsed.append(item)
        app.logger.info("Parsed %d %s rows", len(parsed), side)
        return parsed

    def read_option_chain(page):
        html = page.content()
        option_chain = extract_option_chain_from_html(html)
        if option_chain:
            expiration_dates = extract_expiration_dates_from_chain(option_chain)
        else:
            expiration_dates = extract_expiration_dates_from_html(html)
        return option_chain, expiration_dates

    def has_expected_expiry(options, expected_code):
        if not expected_code:
            return False
        for row in options or []:
            name = row.get("Contract Name")
            if extract_contract_expiry_code(name) == expected_code:
                return True
        return False

    encoded = urllib.parse.quote(symbol, safe="")
    base_url = f"https://finance.yahoo.com/quote/{encoded}/options/"
    requested_expiration = expiration.strip() if expiration else None
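The expiration check added in this commit relies on OCC-style contract symbols embedding the expiry date as six digits (YYMMDD), which has_expected_expiry compares against the code derived from the requested timestamp. A minimal sketch with a made-up symbol and timestamp:

import re
from datetime import datetime, timezone

contract = "AAPL260116C00150000"  # hypothetical contract symbol
target_ts = 1768521600            # hypothetical Unix timestamp of the requested expiration

embedded = re.search(r"(\d{6})", contract).group(1)  # "260116", as in extract_contract_expiry_code
expected = datetime.fromtimestamp(target_ts, tz=timezone.utc).strftime("%y%m%d")  # same code expected_expiry_code derives (UTC)
print(embedded == expected)  # True only when the scraped chain matches the requested expiry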
@@ -162,6 +376,8 @@ def scrape_yahoo_options(symbol, expiration=None):
    calls_html = None
    puts_html = None
    calls_full = []
    puts_full = []
    price = None
    selected_expiration_value = None
    selected_expiration_label = None
@@ -210,8 +426,8 @@ def scrape_yahoo_options(symbol, expiration=None):
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            app.logger.info("Page loaded (domcontentloaded) for %s", symbol)
            html = page.content()
            expiration_dates = extract_expiration_dates_from_html(html)
            option_chain, expiration_dates = read_option_chain(page)
            app.logger.info("Option chain found: %s", bool(option_chain))
            expiration_options = build_expiration_options(expiration_dates)
            if fallback_to_base:
@@ -238,8 +454,7 @@ def scrape_yahoo_options(symbol, expiration=None):
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                app.logger.info("Page loaded (domcontentloaded) for %s", symbol)
                html = page.content()
                expiration_dates = extract_expiration_dates_from_html(html)
                option_chain, expiration_dates = read_option_chain(page)
                expiration_options = build_expiration_options(expiration_dates)

            if target_date and expiration_options:
@@ -258,25 +473,34 @@ def scrape_yahoo_options(symbol, expiration=None):
                            for opt in expiration_options
                        ],
                    }
                selected_expiration_value = matched.get("value")
                selected_expiration_label = matched.get("label")
            elif expiration_options and not target_date:
                selected_expiration_value = expiration_options[0].get("value")
                selected_expiration_label = expiration_options[0].get("label")

            app.logger.info("Waiting for options tables...")
            calls_full, puts_full = build_rows_from_chain(option_chain)
            app.logger.info(
                "Option chain rows: calls=%d puts=%d",
                len(calls_full),
                len(puts_full),
            )
            tables = wait_for_tables(page)
            if len(tables) < 2:
                app.logger.error(
                    "Only %d tables found; expected 2. HTML may have changed.",
                    len(tables),
                )
                return {"error": "Could not locate options tables", "stock": symbol}
            if not calls_full and not puts_full:
                app.logger.info("Waiting for options tables...")
                app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables))
                tables = wait_for_tables(page)
                if len(tables) < 2:
                    app.logger.error(
                        "Only %d tables found; expected 2. HTML may have changed.",
                        len(tables),
                    )
                    return {"error": "Could not locate options tables", "stock": symbol}
                calls_html = tables[0].evaluate("el => el.outerHTML")
                puts_html = tables[1].evaluate("el => el.outerHTML")
            app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables))
            calls_html = tables[0].evaluate("el => el.outerHTML")
            puts_html = tables[1].evaluate("el => el.outerHTML")

            # --- Extract current price ---
            try:
@@ -297,53 +521,25 @@ def scrape_yahoo_options(symbol, expiration=None):
        finally:
            browser.close()

    # ----------------------------------------------------------------------
    # Parsing Table HTML
    # ----------------------------------------------------------------------
    def parse_table(table_html, side):
        if not table_html:
            app.logger.warning("No %s table HTML for %s", side, symbol)
            return []
    if not calls_full and not puts_full and calls_html and puts_html:
        calls_full = parse_table(calls_html, "calls")
        puts_full = parse_table(puts_html, "puts")
        soup = BeautifulSoup(table_html, "html.parser")
        headers = [th.get_text(strip=True) for th in soup.select("thead th")]
        rows = soup.select("tbody tr")
        parsed = []
        for r in rows:
            tds = r.find_all("td")
            if len(tds) != len(headers):
                continue
            item = {}
            for i, c in enumerate(tds):
                key = headers[i]
                val = c.get_text(" ", strip=True)
                # Convert numeric fields
                if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]:
                    try:
                        val = float(val.replace(",", ""))
                    except Exception:
                        val = None
                elif key in ["Volume", "Open Interest"]:
                    try:
                        val = int(val.replace(",", ""))
                    except Exception:
                        val = None
                elif val in ["-", ""]:
                    val = None
                item[key] = val
            parsed.append(item)
        app.logger.info("Parsed %d %s rows", len(parsed), side)
        return parsed
    calls_full = parse_table(calls_html, "calls")
    puts_full = parse_table(puts_html, "puts")

    expected_code = expected_expiry_code(target_date)
    if expected_code:
        if not has_expected_expiry(calls_full, expected_code) and not has_expected_expiry(
            puts_full, expected_code
        ):
            return {
                "error": "Options chain does not match requested expiration",
                "stock": symbol,
                "requested_expiration": requested_expiration,
                "expected_expiration_code": expected_code,
                "selected_expiration": {
                    "value": selected_expiration_value,
                    "label": selected_expiration_label,
                },
            }
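For completeness, a small self-contained sketch of the BeautifulSoup parsing that parse_table applies to the scraped table HTML, using a made-up two-row table; the numeric coercion and "-" handling mirror the logic above:

from bs4 import BeautifulSoup

html = """
<table>
  <thead><tr><th>Strike</th><th>Bid</th><th>Volume</th></tr></thead>
  <tbody>
    <tr><td>150.00</td><td>1.05</td><td>1,234</td></tr>
    <tr><td>155.00</td><td>-</td><td>-</td></tr>
  </tbody>
</table>
"""
soup = BeautifulSoup(html, "html.parser")
headers = [th.get_text(strip=True) for th in soup.select("thead th")]
rows = []
for tr in soup.select("tbody tr"):
    cells = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
    row = dict(zip(headers, cells))
    # Coerce prices to float, counts to int, and Yahoo's "-" placeholders to None.
    row["Strike"] = float(row["Strike"].replace(",", ""))
    row["Bid"] = None if row["Bid"] in ("-", "") else float(row["Bid"].replace(",", ""))
    row["Volume"] = None if row["Volume"] in ("-", "") else int(row["Volume"].replace(",", ""))
    rows.append(row)
print(rows)
# [{'Strike': 150.0, 'Bid': 1.05, 'Volume': 1234}, {'Strike': 155.0, 'Bid': None, 'Volume': None}]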
    # ----------------------------------------------------------------------
    # Pruning logic