"""Scrape the OilPrice.com world energy news listing and save it as a CSV file."""

import os

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# URL of the OilPrice.com "World News" listing page
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Set up the data directory (created at import time, relative to the
# current working directory); exist_ok avoids a check-then-create race.
DATA_DIR = os.path.join(os.getcwd(), "data")
os.makedirs(DATA_DIR, exist_ok=True)


def _text_or_none(node):
    """Return the whitespace-stripped text of a parsed tag, or None if the tag is missing."""
    return node.get_text(strip=True) if node is not None else None


def _parse_articles(soup):
    """Collect headline/link/date records from every 'categoryArticle' div in *soup*.

    Each article's extracted fields are printed for debugging. Only articles
    for which all three fields are present and non-empty are returned.

    Returns:
        A list of dicts with the keys 'headline', 'link' and 'date'.
    """
    articles = soup.find_all('div', class_='categoryArticle')
    print(f"Found {len(articles)} articles.")

    news_data = []
    for i, article in enumerate(articles):
        # Look each element up once and reuse the result.
        headline = _text_or_none(article.find('h2', class_='categoryArticle__title'))
        anchor = article.find('a', href=True)
        link = anchor['href'] if anchor is not None else None
        date = _text_or_none(article.find('p', class_='categoryArticle__meta'))

        # Log each article's details for debugging
        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")

        # Only add valid entries
        if headline and link and date:
            news_data.append({
                'headline': headline,
                'link': link,  # Assuming the link is already a full URL
                'date': date,
            })
    return news_data


def scrape_oil_news():
    """Load the news listing in headless Firefox and return the articles found.

    Returns:
        A pandas DataFrame with the columns 'headline', 'link' and 'date',
        one row per complete article. An empty DataFrame is returned when
        the article elements do not appear within 20 seconds.
    """
    print("Scraping oil market news using Selenium...")

    # Set up Selenium options. The ``options.headless`` setter was deprecated
    # and later removed from Selenium; passing the Firefox command-line flag
    # works across versions.
    options = Options()
    options.add_argument("-headless")

    driver = webdriver.Firefox(options=options)
    try:
        driver.get(OIL_NEWS_URL)

        # Wait until 'categoryArticle' elements load
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:
            # Best effort: any failure while waiting is reported as "no data".
            print("Error: Content did not load properly.")
            return pd.DataFrame()

        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if navigation or the wait raised,
        # so no Firefox/geckodriver process is left behind.
        driver.quit()

    # Parse the articles
    soup = BeautifulSoup(page_source, "html.parser")
    return pd.DataFrame(_parse_articles(soup))


def run_scraper():
    """Scrape the news listing and write it to 'oil_news.csv' inside DATA_DIR.

    Nothing is written when the scrape produced no rows; a message is printed
    in either case.
    """
    news_df = scrape_oil_news()
    file_path = os.path.join(DATA_DIR, 'oil_news.csv')

    if not news_df.empty:
        news_df.to_csv(file_path, index=False)
        print(f"Oil news data saved to {file_path}")
    else:
        print("No data was scraped. The CSV file is empty.")