diff --git a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc
index cdf3710..a61209e 100644
Binary files a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc and b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc differ
diff --git a/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak b/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
index 344a3c9..d8577a8 100644
--- a/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
+++ b/Data-Collection/WebScraper/scrapers/backups/oil_news_scraper.py.bak
@@ -1,76 +1,100 @@
+import json
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
-import pandas as pd
 import os
+import time
+import re
 
-# URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
-
-# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
+def load_existing_data(file_path):
+    """Load existing data from JSON file to avoid duplicates."""
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return []
+
+def save_to_json(data, file_path):
+    """Save scraped data to a JSON file, ensuring no duplicates."""
+    existing_data = load_existing_data(file_path)
+    existing_links = {article['link'] for article in existing_data}
+
+    new_data = [article for article in data if article['link'] not in existing_links]
+    combined_data = existing_data + new_data
+
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(combined_data, f, ensure_ascii=False, indent=4)
+    print(f"Oil news data saved to {file_path}")
+
+def extract_keywords(text):
+    """Simple function to extract keywords from text."""
+    keywords = re.findall(r'\b\w+\b', text.lower())
+    return list(set(keywords))[:10]  # Return the first 10 unique keywords
+
 def scrape_oil_news():
     print("Scraping oil market news using Selenium...")
 
-    # Set up Selenium options
     options = Options()
     options.headless = True
     driver = webdriver.Firefox(options=options)
 
-    driver.get(OIL_NEWS_URL)
-
-    # Wait until 'categoryArticle' elements load
-    try:
-        WebDriverWait(driver, 20).until(
-            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
-        )
-    except Exception as e:
-        print("Error: Content did not load properly.")
-        driver.quit()
-        return pd.DataFrame()
-
-    soup = BeautifulSoup(driver.page_source, "html.parser")
-    driver.quit()
-
-    # Parse the articles
-    articles = soup.find_all('div', class_='categoryArticle')
     news_data = []
+    page_number = 1
+    max_pages = 10  # Limit to 10 pages
 
-    print(f"Found {len(articles)} articles.")
+    while page_number <= max_pages:
+        # Load the page with pagination
+        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
+
+        try:
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+            )
+        except Exception as e:
+            print(f"Error: Content did not load properly on page {page_number}.")
+            break
 
-    for i, article in enumerate(articles):
-        # Extract the title, link, and date using the adjusted structure
-        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
-        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
-        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+
+        articles = soup.find_all('div', class_='categoryArticle')
+        if not articles:
+            print(f"No articles found on page {page_number}. Ending pagination.")
+            break
 
-        # Log each article's details for debugging
-        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+        for article in articles:
+            headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+            link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+            date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+            excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
+            author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
+            timestamp = date.split('|')[0].strip() if '|' in date else date
+
+            if headline and link and date:
+                news_data.append({
+                    'headline': headline,
+                    'link': link,
+                    'date': timestamp,
+                    'author': author,
+                    'excerpt': excerpt,
+                    'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
+                    'sentiment_analysis': None  # Placeholder for future sentiment analysis
+                })
 
-        # Only add valid entries
-        if headline and link and date:
-            news_data.append({
-                'headline': headline,
-                'link': link,  # Assuming the link is already a full URL
-                'date': date
-            })
+        page_number += 1
+        time.sleep(2)
 
-    df = pd.DataFrame(news_data)
-    return df
+    driver.quit()
+    return news_data
 
 def run_scraper():
-    news_df = scrape_oil_news()
-    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
-
-    if not news_df.empty:
-        news_df.to_csv(file_path, index=False)
-        print(f"Oil news data saved to {file_path}")
-    else:
-        print("No data was scraped. The CSV file is empty.")
+    file_path = os.path.join(DATA_DIR, 'oil_news.json')
+    news_data = scrape_oil_news()
+    save_to_json(news_data, file_path)
diff --git a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
index 344a3c9..d8577a8 100644
--- a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
+++ b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
@@ -1,76 +1,100 @@
+import json
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
-import pandas as pd
 import os
+import time
+import re
 
-# URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
-
-# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
+def load_existing_data(file_path):
+    """Load existing data from JSON file to avoid duplicates."""
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return []
+
+def save_to_json(data, file_path):
+    """Save scraped data to a JSON file, ensuring no duplicates."""
+    existing_data = load_existing_data(file_path)
+    existing_links = {article['link'] for article in existing_data}
+
+    new_data = [article for article in data if article['link'] not in existing_links]
+    combined_data = existing_data + new_data
+
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(combined_data, f, ensure_ascii=False, indent=4)
+    print(f"Oil news data saved to {file_path}")
+
+def extract_keywords(text):
+    """Simple function to extract keywords from text."""
+    keywords = re.findall(r'\b\w+\b', text.lower())
+    return list(set(keywords))[:10]  # Return the first 10 unique keywords
+
 def scrape_oil_news():
     print("Scraping oil market news using Selenium...")
 
-    # Set up Selenium options
     options = Options()
     options.headless = True
     driver = webdriver.Firefox(options=options)
 
-    driver.get(OIL_NEWS_URL)
-
-    # Wait until 'categoryArticle' elements load
-    try:
-        WebDriverWait(driver, 20).until(
-            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
-        )
-    except Exception as e:
-        print("Error: Content did not load properly.")
-        driver.quit()
-        return pd.DataFrame()
-
-    soup = BeautifulSoup(driver.page_source, "html.parser")
-    driver.quit()
-
-    # Parse the articles
-    articles = soup.find_all('div', class_='categoryArticle')
     news_data = []
+    page_number = 1
+    max_pages = 10  # Limit to 10 pages
 
-    print(f"Found {len(articles)} articles.")
+    while page_number <= max_pages:
+        # Load the page with pagination
+        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
+
+        try:
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+            )
+        except Exception as e:
+            print(f"Error: Content did not load properly on page {page_number}.")
+            break
 
-    for i, article in enumerate(articles):
-        # Extract the title, link, and date using the adjusted structure
-        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
-        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
-        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+
+        articles = soup.find_all('div', class_='categoryArticle')
+        if not articles:
+            print(f"No articles found on page {page_number}. Ending pagination.")
+            break
 
-        # Log each article's details for debugging
-        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+        for article in articles:
+            headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+            link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+            date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+            excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
+            author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
+            timestamp = date.split('|')[0].strip() if '|' in date else date
+
+            if headline and link and date:
+                news_data.append({
+                    'headline': headline,
+                    'link': link,
+                    'date': timestamp,
+                    'author': author,
+                    'excerpt': excerpt,
+                    'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
+                    'sentiment_analysis': None  # Placeholder for future sentiment analysis
+                })
 
-        # Only add valid entries
-        if headline and link and date:
-            news_data.append({
-                'headline': headline,
-                'link': link,  # Assuming the link is already a full URL
-                'date': date
-            })
+        page_number += 1
+        time.sleep(2)
 
-    df = pd.DataFrame(news_data)
-    return df
+    driver.quit()
+    return news_data
 
 def run_scraper():
-    news_df = scrape_oil_news()
-    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
-
-    if not news_df.empty:
-        news_df.to_csv(file_path, index=False)
-        print(f"Oil news data saved to {file_path}")
-    else:
-        print("No data was scraped. The CSV file is empty.")
+    file_path = os.path.join(DATA_DIR, 'oil_news.json')
+    news_data = scrape_oil_news()
+    save_to_json(news_data, file_path)
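
Notes on the new helpers (suggestions only, not part of the committed change):

- list(set(keywords))[:10] returns an arbitrary ten keywords rather than the first ten, because set() does not preserve word order.
- extract_keywords(headline + " " + excerpt if excerpt else headline) parses as headline + ((" " + excerpt) if excerpt else headline), so a missing excerpt concatenates the headline with itself.
- options.headless = True is deprecated in recent Selenium 4 releases and removed in newer ones; options.add_argument("--headless") is the usual way to request headless Firefox if the project is on a current Selenium.

A minimal sketch of how the keyword pieces could look if the intent is "first ten unique words"; build_keyword_text and the limit parameter are illustrative names, not part of the diff:

    import re

    def extract_keywords(text, limit=10):
        """Return the first `limit` unique word tokens, preserving first-seen order."""
        words = re.findall(r'\b\w+\b', text.lower())
        # dict.fromkeys de-duplicates while keeping insertion order (Python 3.7+),
        # unlike set(), so "first N unique keywords" is actually what comes back.
        return list(dict.fromkeys(words))[:limit]

    def build_keyword_text(headline, excerpt):
        """Join headline and excerpt only when an excerpt exists, with explicit precedence."""
        return f"{headline} {excerpt}" if excerpt else headline

    if __name__ == "__main__":
        # With no excerpt, the headline is used once rather than doubled.
        print(extract_keywords(build_keyword_text("Oil prices climb on supply concerns", None)))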