"""Scrape the oilprice.com world-news listing with Selenium and store it as JSON."""

import json
import os
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
DATA_DIR = os.path.join(os.getcwd(), "data")
KEYWORD_FILE_PATH = os.path.join(os.getcwd(), "assets", "oil_key_words.txt")

# Make sure the output directory exists as soon as the module is imported.
os.makedirs(DATA_DIR, exist_ok=True)


def load_existing_data(file_path):
    """Load existing data from JSON file to avoid duplicates.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []


def save_to_json(data, file_path):
    """Save scraped data to a JSON file, ensuring no duplicates.

    Articles are identified by their ``'link'`` value. An article is skipped
    when its link is already stored in the file or was already seen earlier
    in ``data``.

    Args:
        data: Iterable of article dicts holding at least ``'link'`` and
            ``'headline'``.
        file_path: Path of the JSON file to read from and write to.
    """
    existing_data = load_existing_data(file_path)
    seen_links = {article['link'] for article in existing_data}

    new_data = []
    for article in data:
        if article['link'] in seen_links:
            print(f"Skipping duplicate article: {article['headline']}")
            continue
        # Remember the link so a repeat inside the same batch is skipped too.
        seen_links.add(article['link'])
        new_data.append(article)

    combined_data = existing_data + new_data
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Oil news data saved to {file_path}")


def load_keyword_importance(file_path):
    """Load keyword importance values from the oil_key_words.txt file.

    Each usable line holds exactly two whitespace-separated fields: a keyword
    and an integer importance. Lines with a different field count are
    ignored; lines whose importance is not an integer are reported and
    skipped.

    Args:
        file_path: Path of the keyword file.

    Returns:
        Dict mapping the lower-cased keyword to its integer importance. The
        dict is empty when the file does not exist.
    """
    keyword_importance = {}
    if not os.path.exists(file_path):
        print(f"Keyword file not found at {file_path}")
        return keyword_importance

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                continue
            keyword, importance = parts
            try:
                keyword_importance[keyword.lower()] = int(importance)
            except ValueError:
                # One malformed line must not abort loading the whole file.
                print(f"Skipping keyword with invalid importance: {line.strip()}")
    return keyword_importance


keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)


def extract_keywords(text, keyword_importance):
    """Extract important keywords from text based on an external keyword list.

    Args:
        text: Text to scan; matching is done on the lower-cased text.
        keyword_importance: Dict mapping lower-case keywords to importance.

    Returns:
        Up to 10 unique ``(keyword, importance)`` tuples sorted by descending
        importance. Only words longer than three characters are considered.
        Empty or ``None`` text yields an empty list.
    """
    if not text:
        return []

    words = re.findall(r'\b\w+\b', text.lower())
    # A dict keeps each keyword once, no matter how often it occurs.
    keywords = {
        word: keyword_importance[word]
        for word in words
        if len(word) > 3 and word in keyword_importance
    }
    return sorted(keywords.items(), key=lambda item: item[1], reverse=True)[:10]


def analyze_sentiment(text):
    """Basic sentiment analysis placeholder with minimal processing.

    The check is a case-sensitive substring test, so a word that merely
    contains one of the markers also matches.

    Args:
        text: Text to classify.

    Returns:
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    if "profit" in text or "rise" in text:
        return "Positive"
    if "loss" in text or "decline" in text:
        return "Negative"
    return "Neutral"


def _element_text(parent, tag, css_class):
    """Return the stripped text of the first matching child, or None if absent."""
    element = parent.find(tag, class_=css_class)
    return element.get_text(strip=True) if element is not None else None


def _parse_article(article):
    """Turn one ``categoryArticle`` element into an article dict.

    Returns ``None`` when the headline, link or meta line is missing, so the
    caller can skip incomplete entries.
    """
    headline = _element_text(article, 'h2', 'categoryArticle__title')
    anchor = article.find('a', href=True)
    link = anchor['href'] if anchor is not None else None
    meta = _element_text(article, 'p', 'categoryArticle__meta')
    excerpt = _element_text(article, 'p', 'categoryArticle__excerpt')

    # Check before touching `meta` or `headline`: either one may be None.
    if not (headline and link and meta):
        return None

    # The meta line is split on '|': first field is the timestamp, last is
    # the author. Without a '|' the whole line is kept as the timestamp.
    meta_parts = meta.split('|')
    timestamp = meta_parts[0].strip()
    author = meta_parts[-1].strip() if len(meta_parts) > 1 else "Unknown Author"

    keyword_text = f"{headline} {excerpt}" if excerpt else headline
    return {
        'headline': headline,
        'link': link,
        'date': timestamp,
        'author': author,
        'excerpt': excerpt,
        'keywords': extract_keywords(keyword_text, keyword_importance),
        # analyze_sentiment() is not wired in; the field is stored as None.
        'sentiment_analysis': None,
    }


def scrape_oil_news(max_pages=10):
    """Scrape article summaries from the paginated news listing.

    Pages are fetched in order starting at 1. Scraping stops early when a
    page fails to load its article container or contains no articles.

    Args:
        max_pages: Maximum number of listing pages to visit.

    Returns:
        List of article dicts with the keys ``headline``, ``link``, ``date``,
        ``author``, ``excerpt``, ``keywords`` and ``sentiment_analysis``.
    """
    print("Scraping oil market news using Selenium...")

    options = Options()
    # Pass the flag explicitly: the `options.headless` setter is deprecated
    # and newer Selenium releases silently ignore it.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    news_data = []
    try:
        for page_number in range(1, max_pages + 1):
            print(f"Processing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")

            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                )
            except Exception:
                # Best effort: keep what was collected so far and stop paging.
                print(f"Error: Content did not load properly on page {page_number}.")
                break

            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                print(f"No articles found on page {page_number}. Ending pagination.")
                break

            for article in articles:
                parsed = _parse_article(article)
                if parsed is not None:
                    news_data.append(parsed)

            # Pause between page requests.
            time.sleep(2)
    finally:
        # Always shut the browser down, even when a page raises.
        driver.quit()

    return news_data


def run_scraper():
    """Scrape the news listing and merge the result into ``oil_news.json``."""
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)