diff --git a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc
index 3639622..cdf3710 100644
Binary files a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc and b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc differ
diff --git a/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak b/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
new file mode 100644
index 0000000..344a3c9
--- /dev/null
+++ b/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
@@ -0,0 +1,76 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+
+# URL for OilPrice.com homepage
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+
+# Set up the data directory
+DATA_DIR = os.path.join(os.getcwd(), "data")
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+def scrape_oil_news():
+    print("Scraping oil market news using Selenium...")
+
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
+
+    driver.get(OIL_NEWS_URL)
+
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
+    articles = soup.find_all('div', class_='categoryArticle')
+    news_data = []
+
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                'link': link, # Assuming the link is already a full URL
+                'date': date
+            })
+
+    df = pd.DataFrame(news_data)
+    return df
+
+def run_scraper():
+    news_df = scrape_oil_news()
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    if not news_df.empty:
+        news_df.to_csv(file_path, index=False)
+        print(f"Oil news data saved to {file_path}")
+    else:
+        print("No data was scraped. The CSV file is empty.")
diff --git a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
index b0791d2..344a3c9 100644
--- a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
+++ b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
@@ -1,6 +1,8 @@
-# scrapers/oil_news_scraper.py
-
-import requests
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
 import pandas as pd
 import os
@@ -8,60 +10,67 @@ import os
 # URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
 
-# Define the directory to store the scraped data
+# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
-# Function to scrape news headlines from OilPrice.com
 def scrape_oil_news():
-    print("Scraping oil market news...")
+    print("Scraping oil market news using Selenium...")
 
-    # Send an HTTP request to the website
-    response = requests.get(OIL_NEWS_URL)
-    response.raise_for_status()
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
 
-    # Print the HTML to see what we are working with
-    print(response.text[:1000]) # Print only the first 1000 characters for brevity
+    driver.get(OIL_NEWS_URL)
 
-    # Parse the HTML using BeautifulSoup
-    soup = BeautifulSoup(response.text, "html.parser")
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
 
-    # Find all news article containers (class names updated)
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
     articles = soup.find_all('div', class_='categoryArticle')
-
-    # List to store the scraped data
     news_data = []
 
-    # Loop through each article container
-    for article in articles:
-        # Extract the headline, date, and link
-        headline = article.find('a').get_text(strip=True) if article.find('a') else None
-        link = article.find('a')['href'] if article.find('a') else None
-        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None
+    print(f"Found {len(articles)} articles.")
 
-        # Only append valid data
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
         if headline and link and date:
             news_data.append({
                 'headline': headline,
-                'link': f"https://oilprice.com{link}",
+                'link': link, # Assuming the link is already a full URL
                 'date': date
             })
 
     df = pd.DataFrame(news_data)
     return df
 
-# Function to run the scraper and save data
 def run_scraper():
-    # Scrape oil news
     news_df = scrape_oil_news()
-
-    # Define the file path for saving the data
     file_path = os.path.join(DATA_DIR, 'oil_news.csv')
 
-    # Save the DataFrame to a CSV file
     if not news_df.empty:
         news_df.to_csv(file_path, index=False)
         print(f"Oil news data saved to {file_path}")
     else:
         print("No data was scraped. The CSV file is empty.")
+
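Reviewer note, not part of the diff: both copies of this file set `options.headless = True`, a setter that is deprecated in Selenium 4 and removed in recent releases, so the scraper will break on a current install. A minimal sketch of the argument-based replacement, assuming Selenium 4 with geckodriver on the PATH:

```python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Supported way to run Firefox headless; replaces the removed
# `options.headless = True` setter used in the diff above.
options = Options()
options.add_argument("--headless")

driver = webdriver.Firefox(options=options)
try:
    driver.get("https://oilprice.com/Latest-Energy-News/World-News/")
    print(driver.title)  # quick sanity check that the page actually loaded
finally:
    driver.quit()  # release the browser even if the page load raises
```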
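A second note on cleanup: `scrape_oil_news` calls `driver.quit()` on the success path and on the timeout path, but any other exception (for example inside `driver.get`) would leak the browser process. A sketch of the same fetch-and-parse flow with a single `try/finally`, reusing the names from the diff; `fetch_news_soup` is a hypothetical helper, not part of the change:

```python
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

def fetch_news_soup():
    """Load the news page and return parsed HTML, or None if it never renders."""
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(OIL_NEWS_URL)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
        )
        return BeautifulSoup(driver.page_source, "html.parser")
    except Exception:
        print("Error: Content did not load properly.")
        return None
    finally:
        driver.quit()  # runs on every exit path, so the browser never leaks
```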