oil scraper works
Binary file not shown.
@@ -0,0 +1,76 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+
+# URL for OilPrice.com homepage
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+
+# Set up the data directory
+DATA_DIR = os.path.join(os.getcwd(), "data")
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+def scrape_oil_news():
+    print("Scraping oil market news using Selenium...")
+
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
+
+    driver.get(OIL_NEWS_URL)
+
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
+    articles = soup.find_all('div', class_='categoryArticle')
+    news_data = []
+
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                'link': link,  # Assuming the link is already a full URL
+                'date': date
+            })
+
+    df = pd.DataFrame(news_data)
+    return df
+
+def run_scraper():
+    news_df = scrape_oil_news()
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    if not news_df.empty:
+        news_df.to_csv(file_path, index=False)
+        print(f"Oil news data saved to {file_path}")
+    else:
+        print("No data was scraped. The CSV file is empty.")
+
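The new module's entry point is run_scraper(). As a usage note, a minimal driver sketch follows; the scrapers/oil_news_scraper.py module path is taken from the comment removed in the diff below, while the __main__ guard and the geckodriver remark are illustrative assumptions, not part of this commit:

    # Hypothetical driver script; assumes the module lives at
    # scrapers/oil_news_scraper.py and that geckodriver is on PATH
    # so Selenium can launch Firefox.
    from scrapers.oil_news_scraper import run_scraper

    if __name__ == "__main__":
        run_scraper()  # writes data/oil_news.csv under the working directory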
@@ -1,6 +1,8 @@
-# scrapers/oil_news_scraper.py
 from selenium import webdriver
-import requests
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
 import pandas as pd
 import os
@@ -8,60 +10,67 @@ import os
 # URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
 
-# Define the directory to store the scraped data
+# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
-# Function to scrape news headlines from OilPrice.com
 def scrape_oil_news():
-    print("Scraping oil market news...")
+    print("Scraping oil market news using Selenium...")
 
-    # Send an HTTP request to the website
-    response = requests.get(OIL_NEWS_URL)
-    response.raise_for_status()
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
 
-    # Print the HTML to see what we are working with
-    print(response.text[:1000])  # Print only the first 1000 characters for brevity
+    driver.get(OIL_NEWS_URL)
 
-    # Parse the HTML using BeautifulSoup
-    soup = BeautifulSoup(response.text, "html.parser")
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
 
-    # Find all news article containers (class names updated)
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
     articles = soup.find_all('div', class_='categoryArticle')
-
-    # List to store the scraped data
     news_data = []
 
-    # Loop through each article container
-    for article in articles:
-        # Extract the headline, date, and link
-        headline = article.find('a').get_text(strip=True) if article.find('a') else None
-        link = article.find('a')['href'] if article.find('a') else None
-        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None
-
-        # Only append valid data
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
         if headline and link and date:
             news_data.append({
                 'headline': headline,
-                'link': f"https://oilprice.com{link}",
+                'link': link,  # Assuming the link is already a full URL
                 'date': date
             })
 
     df = pd.DataFrame(news_data)
     return df
 
-# Function to run the scraper and save data
 def run_scraper():
-    # Scrape oil news
     news_df = scrape_oil_news()
-
-    # Define the file path for saving the data
     file_path = os.path.join(DATA_DIR, 'oil_news.csv')
 
-    # Save the DataFrame to a CSV file
     if not news_df.empty:
         news_df.to_csv(file_path, index=False)
         print(f"Oil news data saved to {file_path}")
     else:
         print("No data was scraped. The CSV file is empty.")
+
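One caveat on the committed code: options.headless = True only works on Selenium versions that still expose the boolean headless property, which was deprecated during the Selenium 4 series and later removed. A sketch of the equivalent setup for newer Selenium releases, using Firefox's own -headless flag via add_argument (this is an editor's suggestion, not part of the commit):

    # Sketch for newer Selenium 4 releases, where Options.headless is gone;
    # passing Firefox's -headless argument achieves the same effect.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)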
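A second observation: the except Exception as e branch never uses e, and catching bare Exception hides the actual failure. WebDriverWait.until raises TimeoutException when the condition never becomes true, so catching that specifically makes timeouts easier to diagnose. A sketch, with wait_for_articles being a hypothetical helper name:

    # Sketch: narrower exception handling for the wait; TimeoutException is
    # what WebDriverWait.until raises when the timeout expires.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    def wait_for_articles(driver, timeout=20):
        """Return True once 'categoryArticle' elements are present, False on timeout."""
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
            return True
        except TimeoutException:
            print("Error: Content did not load properly (wait timed out).")
            return False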