MidasEngine/Data-Collection/WebScraper/scrapers/oil_news_scraper.py

# scrapers/oil_news_scraper.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# URL for OilPrice.com homepage
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Define the directory to store the scraped data
DATA_DIR = os.path.join(os.getcwd(), "data")
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

# Function to scrape news headlines from OilPrice.com
def scrape_oil_news():
    print("Scraping oil market news...")

    # Send an HTTP request to the website
    response = requests.get(OIL_NEWS_URL)
    response.raise_for_status()

    # Print the HTML to see what we are working with
    print(response.text[:1000])  # Print only the first 1000 characters for brevity

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all news article containers (class names updated)
    articles = soup.find_all('div', class_='categoryArticle')

    # List to store the scraped data
    news_data = []

    # Loop through each article container
    for article in articles:
        # Extract the headline, date, and link
        headline = article.find('a').get_text(strip=True) if article.find('a') else None
        link = article.find('a')['href'] if article.find('a') else None
        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None

        # Only append valid data
        if headline and link and date:
            news_data.append({
                'headline': headline,
                'link': f"https://oilprice.com{link}",
                'date': date
            })

    df = pd.DataFrame(news_data)
    return df

# Function to run the scraper and save data
def run_scraper():
    # Scrape oil news
    news_df = scrape_oil_news()

    # Define the file path for saving the data
    file_path = os.path.join(DATA_DIR, 'oil_news.csv')

    # Save the DataFrame to a CSV file
    if not news_df.empty:
        news_df.to_csv(file_path, index=False)
        print(f"Oil news data saved to {file_path}")
    else:
        print("No data was scraped. The CSV file is empty.")