oil scraper works

2024-10-30 23:47:32 -04:00
parent b6e0578b2d
commit 3ae788ed9b
3 changed files with 114 additions and 29 deletions
--- a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc
+++ b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc
--- a/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
+++ b/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
@@ -0,0 +1,76 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+
+# URL of the OilPrice.com "World News" listing page (not the site homepage)
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+
+# Set up the data directory
+DATA_DIR = os.path.join(os.getcwd(), "data")
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+def scrape_oil_news():
+    print("Scraping oil market news using Selenium...")
+
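+    # Run Firefox headless. NOTE(review): the `headless` setter was removed in Selenium 4.13 -- use options.add_argument("-headless") there; confirm installed version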
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
+
+    driver.get(OIL_NEWS_URL)
+
+    # Wait up to 20 s for the first 'categoryArticle' element to be present in the DOM
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
+    articles = soup.find_all('div', class_='categoryArticle')
+    news_data = []
+
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Pull the headline (h2), the first anchor with an href, and the meta text (p, stored as the date); each is None when its tag is absent
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                'link': link,  # Assuming the link is already a full URL
+                'date': date
+            })
+
+    df = pd.DataFrame(news_data)
+    return df
+
+def run_scraper():
+    news_df = scrape_oil_news()
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    if not news_df.empty:
+        news_df.to_csv(file_path, index=False)
+        print(f"Oil news data saved to {file_path}")
+    else:
+        print("No data was scraped. The CSV file is empty.")
+
--- a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
+++ b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
@@ -1,6 +1,8 @@
-# scrapers/oil_news_scraper.py
-
-import requests
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
 import pandas as pd
 import os
@@ -8,60 +10,67 @@ import os
 # URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

-# Define the directory to store the scraped data
+# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

-# Function to scrape news headlines from OilPrice.com
 def scrape_oil_news():
-    print("Scraping oil market news...")
+    print("Scraping oil market news using Selenium...")

-    # Send an HTTP request to the website
-    response = requests.get(OIL_NEWS_URL)
-    response.raise_for_status()
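+    # Run Firefox headless. NOTE(review): the `headless` setter was removed in Selenium 4.13 -- use options.add_argument("-headless") there; confirm installed version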
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)

-    # Print the HTML to see what we are working with
-    print(response.text[:1000])  # Print only the first 1000 characters for brevity
+    driver.get(OIL_NEWS_URL)

-    # Parse the HTML using BeautifulSoup
-    soup = BeautifulSoup(response.text, "html.parser")
+    # Wait up to 20 s for the first 'categoryArticle' element to be present in the DOM
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()

-    # Find all news article containers (class names updated)
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
    articles = soup.find_all('div', class_='categoryArticle')
-
-    # List to store the scraped data
    news_data = []

-    # Loop through each article container
-    for article in articles:
-        # Extract the headline, date, and link
-        headline = article.find('a').get_text(strip=True) if article.find('a') else None
-        link = article.find('a')['href'] if article.find('a') else None
-        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None
+    print(f"Found {len(articles)} articles.")

-        # Only append valid data
+    for i, article in enumerate(articles):
+        # Pull the headline (h2), the first anchor with an href, and the meta text (p, stored as the date); each is None when its tag is absent
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
        if headline and link and date:
            news_data.append({
                'headline': headline,
-                'link': f"https://oilprice.com{link}",
+                'link': link,  # Assuming the link is already a full URL
                'date': date
            })

    df = pd.DataFrame(news_data)
    return df

-# Function to run the scraper and save data
 def run_scraper():
-    # Scrape oil news
    news_df = scrape_oil_news()
-
-    # Define the file path for saving the data
    file_path = os.path.join(DATA_DIR, 'oil_news.csv')

-    # Save the DataFrame to a CSV file
    if not news_df.empty:
        news_df.to_csv(file_path, index=False)
        print(f"Oil news data saved to {file_path}")
    else:
        print("No data was scraped. The CSV file is empty.")
+