# MidasEngine/Data-Collection/WebScraper/scrapers/oil_news_scraper.py

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os

# URL of the OilPrice.com "World News" listing page (not the site homepage)
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Output directory for scraped data, resolved against the working directory
# at import time. exist_ok=True makes creation idempotent and avoids the
# check-then-create race when two processes import this module at once.
DATA_DIR = os.path.join(os.getcwd(), "data")
os.makedirs(DATA_DIR, exist_ok=True)

def _first_text(article, tag, css_class):
    """Return the stripped text of the first `tag.css_class` inside `article`, or None."""
    node = article.find(tag, class_=css_class)
    return node.get_text(strip=True) if node is not None else None


def scrape_oil_news():
    """Scrape article headlines, links and dates from the OIL_NEWS_URL listing page.

    The page is loaded in headless Firefox, the function waits up to 20 seconds
    for a ``categoryArticle`` element to appear, and the rendered HTML is then
    parsed with BeautifulSoup.

    Returns:
        pandas.DataFrame with columns ``headline``, ``link`` and ``date``, one
        row per article for which all three fields were found. The frame is
        empty (zero rows) when the content did not load or no complete
        article was found.

    Raises:
        Whatever ``driver.get`` / ``driver.page_source`` raise; the browser is
        shut down before the exception propagates.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    columns = ["headline", "link", "date"]

    print("Scraping oil market news using Selenium...")

    # The `options.headless = True` setter is deprecated and was removed in
    # later Selenium 4 releases, where assigning it no longer has any effect;
    # passing the browser flag works regardless of Selenium version.
    options = Options()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    # try/finally guarantees the browser process is released on every path,
    # including navigation errors.
    try:
        driver.get(OIL_NEWS_URL)

        # Wait until at least one 'categoryArticle' element is present.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception as exc:
            # Best-effort: report what went wrong and hand back an empty frame.
            print(f"Error: Content did not load properly. ({type(exc).__name__})")
            return pd.DataFrame(columns=columns)

        page_html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")
    articles = soup.find_all('div', class_='categoryArticle')
    news_data = []

    print(f"Found {len(articles)} articles.")

    for position, article in enumerate(articles, start=1):
        headline = _first_text(article, 'h2', 'categoryArticle__title')
        date = _first_text(article, 'p', 'categoryArticle__meta')

        anchor = article.find('a', href=True)
        href = anchor['href'] if anchor is not None else None
        # Resolve relative hrefs against the listing URL; absolute URLs pass
        # through unchanged. An empty href stays falsy so the entry is skipped.
        link = urljoin(OIL_NEWS_URL, href) if href else href

        # Log each article's details for debugging
        print(f"Article {position} - Headline: {headline}, Link: {link}, Date: {date}")

        # Only keep entries where every field was found
        if headline and link and date:
            news_data.append({
                'headline': headline,
                'link': link,
                'date': date,
            })

    return pd.DataFrame(news_data, columns=columns)

def run_scraper():
    """Run the oil-news scrape and persist the result as CSV.

    Calls ``scrape_oil_news()`` and, when it returns at least one row, writes
    the frame (without the index) to ``oil_news.csv`` inside ``DATA_DIR``.
    When the frame is empty nothing is written and a notice is printed instead.
    """
    scraped = scrape_oil_news()
    destination = os.path.join(DATA_DIR, 'oil_news.csv')

    # Guard clause: nothing to persist.
    if scraped.empty:
        print("No data was scraped. The CSV file is empty.")
        return

    scraped.to_csv(destination, index=False)
    print(f"Oil news data saved to {destination}")