import json
import re
import os
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm  # Progress bar

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__))  # One level up
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)


def load_existing_data(file_path):
    """Return previously scraped articles from disk, or an empty list if none exist."""
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []


def save_to_json(data, file_path):
    """Merge new articles into the existing JSON file, skipping duplicates by link."""
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data if 'link' in article}

    new_data = []
    for article in data:
        if 'link' not in article or article['link'] in existing_links:
            print(f"Skipping duplicate or missing-link article: {article.get('headline', 'Unknown Headline')}")
            continue
        new_data.append(article)

    combined_data = existing_data + new_data
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {file_path}")


def load_keyword_importance(file_path):
    """Load `keyword importance` pairs (one per line) into a dict keyed by lowercase keyword."""
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    keyword, importance = parts
                    keyword_importance[keyword.lower()] = int(importance)
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance


keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)


def extract_keywords(text, keyword_importance):
    """Return up to the 10 highest-importance keywords found in `text` as (keyword, importance) pairs."""
    words = re.findall(r'\b\w+\b', text.lower())
    keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
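# Expected format of assets/oil_key_words.txt, one keyword and one integer
# weight per line (entries below are illustrative, not from the real file):
#
#   crude 10
#   opec 8
#   pipeline 5
#
# With those weights, extract_keywords("OPEC trims crude output",
# {"crude": 10, "opec": 8}) returns [("crude", 10), ("opec", 8)].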
def filter_content(content):
    """Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
    patterns = [
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        r'More Top Reads From Oilprice\.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice\.com',
        r'"It\'s most important 8 minute read of my week…"',
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    ]
    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)
    content = re.sub(r'\s+', ' ', content).strip()
    return content


def scrape_author_info(driver, author_url, headline_pages=1):
    """Scrape the author's name, bio, contributor-since date, and latest article
    headlines with excerpts, keywords, and timestamps."""
    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load the author page and wait for the heading to render.
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        bio_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor-since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = (
            contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "")
            if contributor_since_tag else "Unknown Date"
        )

        # Extract the author's latest articles: heading, excerpt, keywords, and timestamp.
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')
                timestamp_tag = article.find('div', class_='meta')
                if heading_tag and excerpt_tag and timestamp_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = filter_content(excerpt_tag.get_text(strip=True))  # Strip boilerplate from excerpts too
                    timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
                    keywords = [keyword for keyword, _ in extract_keywords(excerpt, keyword_importance)]
                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords,
                        "published_date": timestamp,
                    })
    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles,
    }
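# Shape of the dict returned by scrape_author_info (values here are
# hypothetical; the keys match the return statement above):
#
#   {
#       "name": "Jane Doe",
#       "bio": "Energy markets analyst covering OPEC policy.",
#       "contributor_since": "01 Jan 2020",
#       "other_articles": [
#           {"heading": "...", "excerpt": "...", "keywords": ["crude", "opec"],
#            "published_date": "..."},
#       ],
#   }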
def scrape_oil_news():
    print("Scraping oil news articles for sentiment analysis...")
    options = Options()
    options.add_argument("--headless")  # Selenium 4 removed the `Options.headless` attribute
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 1
    total_articles = 0

    # First pass: count articles so tqdm can show overall progress.
    while page_number <= max_pages:
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_articles += len(soup.find_all('div', class_='categoryArticle'))
        page_number += 1

    # Second pass: scrape each article.
    page_number = 1
    with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
        while page_number <= max_pages:
            print(f"\nProcessing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                break

            for article in articles:
                headline_tag = article.find('h2', class_='categoryArticle__title')
                headline = headline_tag.get_text(strip=True) if headline_tag else None
                link_tag = article.find('a', href=True)
                link = link_tag['href'] if link_tag else None
                date_meta = article.find('p', class_='categoryArticle__meta')
                date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None

                content = ""
                author_info = {"name": "Unknown", "bio": "", "contributor_since": "", "other_articles": []}
                if link:
                    print(f"Fetching article: {link}")
                    driver.get(link)
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
                        )
                        article_soup = BeautifulSoup(driver.page_source, "html.parser")
                        raw_content = " ".join(p.get_text(strip=True) for p in article_soup.find_all('p'))
                        content = filter_content(raw_content)

                        # Follow the author's profile link, if present, for bio details.
                        author_link = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
                        if author_link and author_link.has_attr('href'):
                            author_info = scrape_author_info(driver, author_link['href'], headline_pages=1)
                    except Exception:
                        print(f"Error: Content did not load for article {headline}.")

                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)

                if headline and link and date:
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
                        'author': author_info['name'],
                        'author_bio': author_info['bio'],
                        'contributor_since': author_info['contributor_since'],
                        'other_articles': author_info['other_articles'],
                        'keywords': extracted_keywords,
                    })
                    pbar.set_postfix_str(f"Processing article: {headline[:40]}...")
                    pbar.update(1)

            page_number += 1
            time.sleep(2)

    driver.quit()
    return news_data


def run_preprocessor():
    file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)


if __name__ == "__main__":
    run_preprocessor()
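# A minimal sketch of consuming the saved output (assumes the scraper has
# already produced data/preprocessed_oil_news.json; the field names match the
# dicts built in scrape_oil_news):
#
#   import json
#   with open("data/preprocessed_oil_news.json", encoding="utf-8") as f:
#       articles = json.load(f)
#   for a in articles[:5]:
#       print(a["date"], a["headline"], [k for k, _ in a["keywords"]][:3])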