Initial commit with code
scraper_service(works).py (Normal file, 143 lines added)
@@ -0,0 +1,143 @@
import threading

from flask import Flask, jsonify
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

app = Flask(__name__)

# Global variables to store scrape status and processed data
SCRAPE_STATUS = {"done": False, "error": None}
PROCESSED_DATA = []


def run_selenium_scrape():
    global SCRAPE_STATUS
    global PROCESSED_DATA
    SCRAPE_STATUS = {"done": False, "error": None}
    PROCESSED_DATA = []  # Clear previous data

    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--start-maximized")
    # HEADFUL: do NOT use --headless

    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get("https://finance.yahoo.com/quote/%5ESPX/options/")

        # Optional: click accept on consent popup if present
        try:
            consent_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Accept')]"))
            )
            consent_btn.click()
        except Exception:
            pass  # No consent popup, ignore

        # Wait for the options table
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "section[data-testid='options-list-table']")
            )
        )

        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        section = soup.find("section", {"data-testid": "options-list-table"})
        if section:
            # Extract headers
            headers = [th.get_text(strip=True) for th in section.find('thead').find_all('th')]

            # Extract rows
            rows = section.find('tbody').find_all('tr')

            cleaned_data = []
            for row in rows:
                cols = row.find_all('td')
                row_data = {}
                for i, col in enumerate(cols):
                    if i >= len(headers):
                        break  # Ignore any cells beyond the known headers

                    # Clean the cell text and strip whitespace
                    value = col.get_text(separator=' ', strip=True)

                    # Convert to appropriate types and handle empty/'-' values
                    if headers[i] in ('Strike', 'Last Price', 'Bid', 'Ask', 'Change'):
                        try:
                            value = float(value.replace(',', ''))
                        except ValueError:
                            value = None  # Set to None for empty/non-numeric values
                    elif headers[i] in ('Volume', 'Open Interest'):
                        try:
                            value = int(value.replace(',', ''))
                        except ValueError:
                            value = None  # Set to None for empty/non-numeric values
                    elif value in ('-', ''):
                        value = None  # Explicitly handle '-' and empty strings as None

                    if value is not None:  # Only include non-empty/non-nil values
                        row_data[headers[i]] = value

                if row_data:  # Only add row if it contains any data after cleaning
                    cleaned_data.append(row_data)

            PROCESSED_DATA = cleaned_data
        else:
            PROCESSED_DATA = []

        SCRAPE_STATUS = {"done": True, "error": None}

    except Exception as e:
        SCRAPE_STATUS = {"done": False, "error": str(e)}

    finally:
        driver.quit()


# Option 1: synchronous scrape - request waits for scrape to finish
@app.route('/scrape_sync', methods=['GET'])
def scrape_sync():
    run_selenium_scrape()
    if SCRAPE_STATUS["done"]:
        return jsonify(PROCESSED_DATA)
    else:
        return jsonify({"error": SCRAPE_STATUS["error"]}), 500


# Option 2: threaded scrape + join - start thread, then wait for it in request
@app.route('/scrape_threaded', methods=['GET'])
def scrape_threaded():
    thread = threading.Thread(target=run_selenium_scrape)
    thread.start()
    thread.join()  # wait for scraping to finish

    if SCRAPE_STATUS["done"]:
        return jsonify(PROCESSED_DATA)
    else:
        return jsonify({"error": SCRAPE_STATUS["error"]}), 500
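

# Sketch (not part of the original file): a non-blocking variant that starts the
# scrape in a background thread and returns immediately, illustrating how the
# /status and /result endpoints below are meant to be polled. The route name
# '/scrape_async' is hypothetical.
@app.route('/scrape_async', methods=['GET'])
def scrape_async():
    thread = threading.Thread(target=run_selenium_scrape, daemon=True)
    thread.start()
    # 202 Accepted: the scrape is running; poll /status until "done" is true
    return jsonify({"started": True}), 202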


# Your existing endpoints to check status or get result directly
@app.route('/status', methods=['GET'])
def status():
    return jsonify(SCRAPE_STATUS)


@app.route('/result', methods=['GET'])
def result():
    # This endpoint can return the processed JSON data if a scrape was successful
    if SCRAPE_STATUS["done"]:
        return jsonify(PROCESSED_DATA)
    else:
        return jsonify({"error": "No data available or scrape not yet complete. Run /scrape_sync or /scrape_threaded first."}), 404


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)
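
A minimal client sketch for exercising the service, assuming the file above has been started locally so it listens on port 8000 as configured; it uses the requests library, and the exact JSON returned depends on what Yahoo Finance serves at scrape time:

# Hypothetical client, not part of the committed file
import requests

BASE = "http://localhost:8000"

# Blocking scrape: returns the cleaned option rows as JSON on success
resp = requests.get(f"{BASE}/scrape_sync")
print(resp.status_code, resp.json())

# Status of the most recent scrape, and the last successfully scraped data
print(requests.get(f"{BASE}/status").json())
print(requests.get(f"{BASE}/result").json())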