Fix expiration-specific options parsing
This commit is contained in:
@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
||||
from datetime import datetime, timezone
|
||||
import urllib.parse
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
|
||||
@@ -45,6 +46,159 @@ def format_expiration_label(timestamp):
|
||||
return str(timestamp)
|
||||
|
||||
|
||||
def format_percent(value):
    """Render a fractional ratio as a percentage string with two decimals.

    Args:
        value: A number expressed as a fraction (``0.1234`` means 12.34%),
            or ``None``.

    Returns:
        A string such as ``"12.34%"``, or ``None`` when ``value`` is ``None``
        or cannot be multiplied by 100 and formatted as a float.
    """
    if value is None:
        return None
    try:
        scaled = value * 100
        return f"{scaled:.2f}%"
    except Exception:
        # Best-effort formatting: anything that cannot be rendered yields None.
        return None
|
||||
|
||||
|
||||
def extract_raw_value(value):
    """Return the ``"raw"`` entry of a dict, or the value itself otherwise.

    Args:
        value: Either a dict (looked up under the ``"raw"`` key) or any
            other object.

    Returns:
        ``value.get("raw")`` for dicts (``None`` if the key is absent);
        non-dict inputs are returned unchanged.
    """
    return value.get("raw") if isinstance(value, dict) else value
|
||||
|
||||
|
||||
def extract_fmt_value(value):
    """Return the ``"fmt"`` entry of a dict, or ``None`` for anything else.

    Args:
        value: Either a dict (looked up under the ``"fmt"`` key) or any
            other object.

    Returns:
        ``value.get("fmt")`` for dicts; ``None`` for non-dict inputs or when
        the key is absent.
    """
    return value.get("fmt") if isinstance(value, dict) else None
|
||||
|
||||
|
||||
def format_percent_value(value):
    """Produce a percentage string, preferring a pre-formatted value.

    Args:
        value: A dict that may carry ``"fmt"`` and/or ``"raw"`` entries, or
            a plain value.

    Returns:
        The ``"fmt"`` entry when one is present; otherwise the result of
        ``format_percent`` applied to the raw value.
    """
    preformatted = extract_fmt_value(value)
    if preformatted is None:
        # No ready-made string: fall back to formatting the raw number.
        return format_percent(extract_raw_value(value))
    return preformatted
|
||||
|
||||
|
||||
def format_last_trade_date(timestamp):
    """Format a Unix timestamp as ``MM/DD/YYYY HH:MM AM/PM EST``.

    The conversion uses an explicit fixed UTC-5 offset so the rendered wall
    clock agrees with the literal ``" EST"`` suffix no matter which time zone
    the host process runs in. Daylight saving time is not applied.

    Args:
        timestamp: Seconds since the epoch, or a dict whose ``"raw"`` entry
            holds that number.

    Returns:
        The formatted string, or ``None`` when the timestamp is missing,
        zero, or cannot be converted.
    """
    # Imported locally so this function does not depend on the file-level
    # import list providing these names.
    from datetime import timedelta, timezone

    timestamp = extract_raw_value(timestamp)
    if not timestamp:
        return None

    eastern_standard = timezone(timedelta(hours=-5), "EST")
    try:
        moment = datetime.fromtimestamp(timestamp, tz=eastern_standard)
        return moment.strftime("%m/%d/%Y %I:%M %p") + " EST"
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric or out-of-range timestamps are treated as "no date".
        return None
|
||||
|
||||
|
||||
def extract_option_chain_from_html(html):
    """Find and decode the embedded ``optionChain`` payload in page HTML.

    Every occurrence of the ``"body":"`` marker is examined in order. The
    value following the marker is decoded as a JSON string literal; if the
    decoded text mentions ``optionChain`` it is parsed as JSON in turn. The
    first candidate whose ``optionChain`` object has a truthy ``"result"``
    entry is returned.

    Args:
        html: Page markup as a string (may be empty or ``None``).

    Returns:
        The ``optionChain`` dict, or ``None`` when no usable payload exists.
    """
    if not html:
        return None

    token = '"body":"'
    decoder = json.JSONDecoder()
    start = 0
    while True:
        idx = html.find(token, start)
        if idx == -1:
            return None
        # Whatever happens with this candidate, resume right after its marker.
        start = idx + len(token)

        try:
            # The marker's last character is the opening quote of the JSON
            # string literal, so decoding starts one character before `start`.
            body_text, _ = decoder.raw_decode(html, start - 1)
        except json.JSONDecodeError:
            continue

        # Cheap substring check before paying for a full JSON parse.
        if "optionChain" not in body_text:
            continue

        try:
            payload = json.loads(body_text)
        except json.JSONDecodeError:
            continue

        # The decoded document may be a list or scalar; only objects can
        # carry an "optionChain" member.
        if not isinstance(payload, dict):
            continue

        option_chain = payload.get("optionChain")
        if isinstance(option_chain, dict) and option_chain.get("result"):
            return option_chain
|
||||
|
||||
|
||||
def extract_expiration_dates_from_chain(chain):
    """Return the ``expirationDates`` list of the chain's first result.

    Args:
        chain: An option-chain dict with a ``"result"`` list, or a falsy
            value.

    Returns:
        The ``"expirationDates"`` value of ``chain["result"][0]``, or an
        empty list when the chain, its results, or the dates are missing
        or empty.
    """
    if chain:
        entries = chain.get("result", [])
        if entries:
            return entries[0].get("expirationDates", []) or []
    return []
|
||||
|
||||
|
||||
def normalize_chain_rows(rows):
    """Convert option-chain contract dicts into table-style row dicts.

    Args:
        rows: An iterable of contract dicts, or ``None``.

    Returns:
        A list with one dict per input row, keyed by display column name.
        Numeric columns take the raw value, percentage columns are rendered
        through ``format_percent_value`` and the trade date through
        ``format_last_trade_date``. Empty or ``None`` input yields ``[]``.
    """
    return [
        {
            "Contract Name": contract.get("contractSymbol"),
            "Last Trade Date (EST)": format_last_trade_date(
                contract.get("lastTradeDate")
            ),
            "Strike": extract_raw_value(contract.get("strike")),
            "Last Price": extract_raw_value(contract.get("lastPrice")),
            "Bid": extract_raw_value(contract.get("bid")),
            "Ask": extract_raw_value(contract.get("ask")),
            "Change": extract_raw_value(contract.get("change")),
            "% Change": format_percent_value(contract.get("percentChange")),
            "Volume": extract_raw_value(contract.get("volume")),
            "Open Interest": extract_raw_value(contract.get("openInterest")),
            "Implied Volatility": format_percent_value(
                contract.get("impliedVolatility")
            ),
        }
        for contract in (rows or [])
    ]
|
||||
|
||||
|
||||
def build_rows_from_chain(chain):
    """Build normalized call and put rows from an option-chain dict.

    Only the first entry of ``"result"`` and the first entry of its
    ``"options"`` list are used.

    Args:
        chain: An option-chain dict, or a falsy value.

    Returns:
        A ``(calls, puts)`` tuple of row lists; both are empty when the
        chain, its results, or its options are missing or empty.
    """
    if not chain:
        return [], []

    entries = chain.get("result", [])
    if not entries:
        return [], []

    contracts = entries[0].get("options", [])
    if not contracts:
        return [], []

    first = contracts[0]
    calls = normalize_chain_rows(first.get("calls"))
    puts = normalize_chain_rows(first.get("puts"))
    return calls, puts
|
||||
|
||||
|
||||
def extract_contract_expiry_code(contract_name):
    """Extract the six-digit expiry code from an option contract name.

    Contract names are assumed to follow the OCC-style layout: root symbol,
    six date digits, ``C`` or ``P``, then eight strike digits. Matching that
    trailing structure keeps digits in the root symbol (for example
    ``TSLA1240119P00150000``) from being mistaken for the date. Names that
    do not fit the layout fall back to the first run of six digits.

    Args:
        contract_name: The contract symbol string, or a falsy value.

    Returns:
        The six-digit code as a string, or ``None`` when the name is empty
        or contains no six-digit run.
    """
    if not contract_name:
        return None

    # Preferred: anchor on the "<date><C|P><8-digit strike>" tail.
    match = re.search(r"(\d{6})[CP]\d{8}$", contract_name)
    if match is None:
        # Fallback: first six consecutive digits anywhere in the name.
        match = re.search(r"(\d{6})", contract_name)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def expected_expiry_code(timestamp):
    """Format a Unix timestamp as a ``YYMMDD`` code, evaluated in UTC.

    Uses a timezone-aware conversion instead of ``datetime.utcfromtimestamp``,
    which is deprecated as of Python 3.12.

    Args:
        timestamp: Seconds since the epoch, or a falsy value.

    Returns:
        The six-character ``%y%m%d`` string, or ``None`` when the timestamp
        is missing, zero, or cannot be converted.
    """
    # Imported locally so this function does not depend on the file-level
    # import list providing the name.
    from datetime import timezone

    if not timestamp:
        return None
    try:
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
        return moment.strftime("%y%m%d")
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric or out-of-range timestamps have no expiry code.
        return None
|
||||
|
||||
|
||||
def extract_expiration_dates_from_html(html):
|
||||
if not html:
|
||||
return []
|
||||
@@ -146,6 +300,66 @@ def wait_for_tables(page):
|
||||
|
||||
|
||||
def scrape_yahoo_options(symbol, expiration=None):
|
||||
def parse_table(table_html, side):
    """Parse one options table's HTML into a list of row dicts.

    Relies on ``symbol`` and ``app`` from the enclosing scope for logging.

    Args:
        table_html: Outer HTML of the table, or a falsy value.
        side: Label used in log messages (e.g. ``"calls"`` or ``"puts"``).

    Returns:
        One dict per body row, keyed by header text. Price-like columns are
        converted to ``float`` and count columns to ``int`` (``None`` when
        conversion fails); other cells stay as text, with ``"-"`` and ``""``
        mapped to ``None``. Rows whose cell count differs from the header
        count are skipped. Returns ``[]`` when no HTML is given.
    """
    if not table_html:
        app.logger.warning("No %s table HTML for %s", side, symbol)
        return []

    soup = BeautifulSoup(table_html, "html.parser")
    headers = [th.get_text(strip=True) for th in soup.select("thead th")]

    float_columns = ["Strike", "Last Price", "Bid", "Ask", "Change"]
    int_columns = ["Volume", "Open Interest"]

    parsed = []
    for table_row in soup.select("tbody tr"):
        cells = table_row.find_all("td")
        if len(cells) != len(headers):
            # Malformed or spacer row: cell count does not line up.
            continue

        item = {}
        for key, cell in zip(headers, cells):
            text = cell.get_text(" ", strip=True)

            # Convert numeric fields
            if key in float_columns:
                try:
                    val = float(text.replace(",", ""))
                except Exception:
                    val = None
            elif key in int_columns:
                try:
                    val = int(text.replace(",", ""))
                except Exception:
                    val = None
            elif text in ["-", ""]:
                val = None
            else:
                val = text

            item[key] = val

        parsed.append(item)

    app.logger.info("Parsed %d %s rows", len(parsed), side)
    return parsed
|
||||
|
||||
def read_option_chain(page):
    """Read the option chain and expiration dates from the current page.

    Args:
        page: An object whose ``content()`` method returns the page HTML.

    Returns:
        A ``(option_chain, expiration_dates)`` tuple. The dates come from
        the chain when one was extracted, otherwise from the HTML itself.
    """
    html = page.content()
    option_chain = extract_option_chain_from_html(html)
    expiration_dates = (
        extract_expiration_dates_from_chain(option_chain)
        if option_chain
        else extract_expiration_dates_from_html(html)
    )
    return option_chain, expiration_dates
|
||||
|
||||
def has_expected_expiry(options, expected_code):
    """Tell whether any row's contract name carries the expected expiry code.

    Args:
        options: An iterable of row dicts with a ``"Contract Name"`` entry,
            or ``None``.
        expected_code: The expiry code to look for; a falsy value never
            matches.

    Returns:
        ``True`` if at least one row's extracted expiry code equals
        ``expected_code``, otherwise ``False``.
    """
    if not expected_code:
        return False
    return any(
        extract_contract_expiry_code(row.get("Contract Name")) == expected_code
        for row in (options or [])
    )
|
||||
|
||||
encoded = urllib.parse.quote(symbol, safe="")
|
||||
base_url = f"https://finance.yahoo.com/quote/{encoded}/options/"
|
||||
requested_expiration = expiration.strip() if expiration else None
|
||||
@@ -162,6 +376,8 @@ def scrape_yahoo_options(symbol, expiration=None):
|
||||
|
||||
calls_html = None
|
||||
puts_html = None
|
||||
calls_full = []
|
||||
puts_full = []
|
||||
price = None
|
||||
selected_expiration_value = None
|
||||
selected_expiration_label = None
|
||||
@@ -210,8 +426,8 @@ def scrape_yahoo_options(symbol, expiration=None):
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
app.logger.info("Page loaded (domcontentloaded) for %s", symbol)
|
||||
|
||||
html = page.content()
|
||||
expiration_dates = extract_expiration_dates_from_html(html)
|
||||
option_chain, expiration_dates = read_option_chain(page)
|
||||
app.logger.info("Option chain found: %s", bool(option_chain))
|
||||
expiration_options = build_expiration_options(expiration_dates)
|
||||
|
||||
if fallback_to_base:
|
||||
@@ -238,8 +454,7 @@ def scrape_yahoo_options(symbol, expiration=None):
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
app.logger.info("Page loaded (domcontentloaded) for %s", symbol)
|
||||
|
||||
html = page.content()
|
||||
expiration_dates = extract_expiration_dates_from_html(html)
|
||||
option_chain, expiration_dates = read_option_chain(page)
|
||||
expiration_options = build_expiration_options(expiration_dates)
|
||||
|
||||
if target_date and expiration_options:
|
||||
@@ -258,25 +473,34 @@ def scrape_yahoo_options(symbol, expiration=None):
|
||||
for opt in expiration_options
|
||||
],
|
||||
}
|
||||
selected_expiration_value = matched.get("value")
|
||||
selected_expiration_label = matched.get("label")
|
||||
elif expiration_options and not target_date:
|
||||
selected_expiration_value = expiration_options[0].get("value")
|
||||
selected_expiration_label = expiration_options[0].get("label")
|
||||
|
||||
app.logger.info("Waiting for options tables...")
|
||||
calls_full, puts_full = build_rows_from_chain(option_chain)
|
||||
app.logger.info(
|
||||
"Option chain rows: calls=%d puts=%d",
|
||||
len(calls_full),
|
||||
len(puts_full),
|
||||
)
|
||||
|
||||
tables = wait_for_tables(page)
|
||||
if len(tables) < 2:
|
||||
app.logger.error(
|
||||
"Only %d tables found; expected 2. HTML may have changed.",
|
||||
len(tables),
|
||||
)
|
||||
return {"error": "Could not locate options tables", "stock": symbol}
|
||||
if not calls_full and not puts_full:
|
||||
app.logger.info("Waiting for options tables...")
|
||||
|
||||
app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables))
|
||||
tables = wait_for_tables(page)
|
||||
if len(tables) < 2:
|
||||
app.logger.error(
|
||||
"Only %d tables found; expected 2. HTML may have changed.",
|
||||
len(tables),
|
||||
)
|
||||
return {"error": "Could not locate options tables", "stock": symbol}
|
||||
|
||||
calls_html = tables[0].evaluate("el => el.outerHTML")
|
||||
puts_html = tables[1].evaluate("el => el.outerHTML")
|
||||
app.logger.info("Found %d tables. Extracting Calls & Puts.", len(tables))
|
||||
|
||||
calls_html = tables[0].evaluate("el => el.outerHTML")
|
||||
puts_html = tables[1].evaluate("el => el.outerHTML")
|
||||
|
||||
# --- Extract current price ---
|
||||
try:
|
||||
@@ -297,53 +521,25 @@ def scrape_yahoo_options(symbol, expiration=None):
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Parsing Table HTML
|
||||
# ----------------------------------------------------------------------
|
||||
def parse_table(table_html, side):
|
||||
if not table_html:
|
||||
app.logger.warning("No %s table HTML for %s", side, symbol)
|
||||
return []
|
||||
if not calls_full and not puts_full and calls_html and puts_html:
|
||||
calls_full = parse_table(calls_html, "calls")
|
||||
puts_full = parse_table(puts_html, "puts")
|
||||
|
||||
soup = BeautifulSoup(table_html, "html.parser")
|
||||
|
||||
headers = [th.get_text(strip=True) for th in soup.select("thead th")]
|
||||
rows = soup.select("tbody tr")
|
||||
|
||||
parsed = []
|
||||
for r in rows:
|
||||
tds = r.find_all("td")
|
||||
if len(tds) != len(headers):
|
||||
continue
|
||||
|
||||
item = {}
|
||||
for i, c in enumerate(tds):
|
||||
key = headers[i]
|
||||
val = c.get_text(" ", strip=True)
|
||||
|
||||
# Convert numeric fields
|
||||
if key in ["Strike", "Last Price", "Bid", "Ask", "Change"]:
|
||||
try:
|
||||
val = float(val.replace(",", ""))
|
||||
except Exception:
|
||||
val = None
|
||||
elif key in ["Volume", "Open Interest"]:
|
||||
try:
|
||||
val = int(val.replace(",", ""))
|
||||
except Exception:
|
||||
val = None
|
||||
elif val in ["-", ""]:
|
||||
val = None
|
||||
|
||||
item[key] = val
|
||||
|
||||
parsed.append(item)
|
||||
|
||||
app.logger.info("Parsed %d %s rows", len(parsed), side)
|
||||
return parsed
|
||||
|
||||
calls_full = parse_table(calls_html, "calls")
|
||||
puts_full = parse_table(puts_html, "puts")
|
||||
expected_code = expected_expiry_code(target_date)
|
||||
if expected_code:
|
||||
if not has_expected_expiry(calls_full, expected_code) and not has_expected_expiry(
|
||||
puts_full, expected_code
|
||||
):
|
||||
return {
|
||||
"error": "Options chain does not match requested expiration",
|
||||
"stock": symbol,
|
||||
"requested_expiration": requested_expiration,
|
||||
"expected_expiration_code": expected_code,
|
||||
"selected_expiration": {
|
||||
"value": selected_expiration_value,
|
||||
"label": selected_expiration_label,
|
||||
},
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Pruning logic
|
||||
|
||||
Reference in New Issue
Block a user