# Files
# MidasEngine/Data-Collection/WebScraper/scrapers/backups/oil_news_preprocessor.py.bak
# 2024-10-31 02:40:16 -04:00
#
# 203 lines
# 8.1 KiB
# Python

import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm # Progress bar
# Base URL of the listing pages; "Page-<n>.html" is appended when scraping.
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Parent of the directory that contains this file. abspath() keeps the result
# independent of the working directory when __file__ is a relative path.
SCRAPER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")

# exist_ok avoids the check-then-create race of exists() followed by makedirs().
os.makedirs(DATA_DIR, exist_ok=True)
def load_existing_data(file_path):
    """Load previously saved data from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist or holds nothing but whitespace.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
            This is deliberately not swallowed so a corrupt archive is
            noticed rather than silently treated as empty.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw = f.read()
    except FileNotFoundError:
        return []
    # A zero-byte (e.g. freshly touched) file is "no data yet", not an error.
    if not raw.strip():
        return []
    return json.loads(raw)
def save_to_json(data, file_path):
    """Append new articles to the JSON archive, skipping duplicates by link.

    An article is skipped (with a console message) when it has no 'link' key
    or when its link is already present, either in the existing file or
    earlier in the same ``data`` batch.

    Args:
        data: Iterable of article dicts to add.
        file_path: Path of the JSON archive; created if it does not exist.
    """
    existing_data = load_existing_data(file_path)
    seen_links = {article['link'] for article in existing_data if 'link' in article}

    new_data = []
    for article in data:
        if 'link' not in article or article['link'] in seen_links:
            print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
            continue
        # Record the link immediately so a repeat inside this batch is also skipped.
        seen_links.add(article['link'])
        new_data.append(article)

    combined_data = existing_data + new_data

    # Write to a sibling temp file and swap it in, so a failed dump cannot
    # truncate the existing archive.
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, ensure_ascii=False, indent=4)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, file_path)
    print(f"Data saved to {file_path}")
def load_keyword_importance(file_path):
    """Read a ``keyword importance`` table from a whitespace-separated text file.

    Only lines that split into exactly two fields are used; the keyword is
    lower-cased and the importance is parsed as an integer. Lines with any
    other number of fields are ignored, and lines whose importance is not an
    integer are skipped with a console message instead of aborting the load.

    Args:
        file_path: Path of the keyword file.

    Returns:
        Dict mapping lower-cased keyword to integer importance. Empty when
        the file is missing (a message is printed in that case).
    """
    keyword_importance = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                parts = line.split()
                if len(parts) != 2:
                    continue
                keyword, importance = parts
                try:
                    keyword_importance[keyword.lower()] = int(importance)
                except ValueError:
                    print(
                        f"Skipping keyword line {line_number} in {file_path}: "
                        f"importance {importance!r} is not an integer"
                    )
    except FileNotFoundError:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance
# Keyword -> importance table, loaded once at import time; scrape_oil_news()
# passes it to extract_keywords() for every article.
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
def extract_keywords(text, keyword_importance):
    """Return up to ten known keywords found in ``text``, most important first.

    The text is lower-cased and split into word tokens; each distinct token
    that appears in ``keyword_importance`` is paired with its importance.

    Returns:
        List of ``(keyword, importance)`` tuples sorted by descending
        importance; ties keep first-occurrence order.
    """
    scored = {}
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in keyword_importance:
            scored[token] = keyword_importance[token]

    ranked = list(scored.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:10]
def filter_content(content):
    """Strip ads, navigation phrases, bylines and disclaimers, then tidy whitespace.

    Every pattern is removed case-insensitively, in the order listed; after
    that, runs of whitespace are collapsed to single spaces and the result
    is trimmed.
    """
    boilerplate = (
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        r'More Top Reads From Oilprice.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice.com',
        r'\"It\'s most important 8 minute read of my week…\"',
        # The two anchored patterns only fire at the very start of the text.
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    )

    cleaned = content
    for expr in boilerplate:
        cleaned = re.compile(expr, re.IGNORECASE).sub('', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()
def extract_author_info(driver, article_soup):
    """Follow the article's 'More Info' / 'Read More' link and read the author page.

    Best-effort: any failure while loading or parsing the author page is
    printed and the values gathered so far are returned.

    Args:
        driver: Selenium WebDriver; it is navigated away from the current
            page when an author link is found.
        article_soup: Parsed article page that is searched for the link.

    Returns:
        ``(author, author_bio)``. ``("Unknown Author", "")`` when no link is
        found or the author page fails; on a loaded page a missing name
        becomes "Unknown Author" and a missing bio "No bio available".
    """
    author = "Unknown Author"
    author_bio = ""

    # `string=` is the current name of the deprecated bs4 `text=` filter.
    author_tag = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
    if not author_tag:
        return author, author_bio

    try:
        driver.get(author_tag['href'])
        # Generous timeout to cope with slow-loading author pages.
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
        )
        bio_soup = BeautifulSoup(driver.page_source, "html.parser")

        # NOTE(review): these take the first <h1>/<p> of the whole page, not
        # of the "authorBio" element waited for above; confirm the markup.
        author_name_tag = bio_soup.find('h1')
        author_bio_tag = bio_soup.find('p')

        # Fall back per element, so a name that was found is not discarded
        # just because the bio is missing (and vice versa). The class names
        # are placeholders carried over from the original code.
        if author_name_tag is None:
            author_name_tag = bio_soup.find('span', class_='author-name')
        if author_bio_tag is None:
            author_bio_tag = bio_soup.find('div', class_='bio-content')

        author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
    except Exception as e:
        print(f"Author bio page failed to load or extract. Error: {e}")

    return author, author_bio
def _parse_listing(soup):
    """Return one {'headline', 'link', 'date'} stub per article teaser on a listing page."""
    stubs = []
    for teaser in soup.find_all('div', class_='categoryArticle'):
        title_tag = teaser.find('h2', class_='categoryArticle__title')
        link_tag = teaser.find('a', href=True)
        meta_tag = teaser.find('p', class_='categoryArticle__meta')
        stubs.append({
            'headline': title_tag.get_text(strip=True) if title_tag else None,
            'link': link_tag['href'] if link_tag else None,
            # The meta line is "<date> | ..."; only the part before the first '|' is kept.
            'date': meta_tag.get_text(strip=True).split('|')[0].strip() if meta_tag else None,
        })
    return stubs


def _fetch_article(driver, link, headline):
    """Load one article page and return (content, author, author_bio); best-effort."""
    # Defaults are bound up front so a failed load can never leave the names
    # undefined or carry over the previous article's author.
    content, author, author_bio = "", "Unknown Author", ""
    print(f"Fetching article: {link}")
    try:
        driver.get(link)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
        )
        article_soup = BeautifulSoup(driver.page_source, "html.parser")
        raw_content = " ".join(p.get_text(strip=True) for p in article_soup.find_all('p'))
        content = filter_content(raw_content)
        author, author_bio = extract_author_info(driver, article_soup)
    except Exception as exc:
        print(f"Error: Content did not load for article {headline}. ({exc})")
    return content, author, author_bio


def scrape_oil_news(max_pages=1):
    """Scrape listing pages and their articles into a list of article dicts.

    Each listing page is loaded once and its teasers are collected; the
    articles are then fetched one by one behind a progress bar. Teasers
    without a headline, link or date are counted but not fetched or returned.

    Args:
        max_pages: Number of listing pages to visit, starting at page 1.
            Paging stops early when a page fails to show articles.

    Returns:
        List of dicts with keys 'headline', 'link', 'content', 'date',
        'author', 'author_bio' and 'keywords'.
    """
    print("Scraping oil news articles for sentiment analysis...")
    options = Options()
    # The `headless` attribute is deprecated/removed in newer Selenium
    # releases; the command-line flag works regardless of version.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    news_data = []

    try:
        # Pass 1: load every listing page once and keep its teasers, which
        # also gives the progress-bar total without fetching pages twice.
        pages = []
        for page_number in range(1, max_pages + 1):
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                )
            except Exception as exc:
                print(f"Listing page {page_number} did not load; stopping pagination. ({exc})")
                break
            stubs = _parse_listing(BeautifulSoup(driver.page_source, "html.parser"))
            if not stubs:
                break
            pages.append((page_number, stubs))

        total_articles = sum(len(stubs) for _, stubs in pages)

        # Pass 2: fetch the articles themselves.
        with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
            for page_number, stubs in pages:
                print(f"\nProcessing page {page_number}...")
                for stub in stubs:
                    headline, link, date = stub['headline'], stub['link'], stub['date']
                    # The headline may be None; slicing None would raise.
                    pbar.set_postfix_str(f"Processing article: {(headline or '')[:40]}...")
                    if headline and link and date:
                        content, author, author_bio = _fetch_article(driver, link, headline)
                        news_data.append({
                            'headline': headline,
                            'link': link,
                            'content': content,
                            'date': date,
                            'author': author,
                            'author_bio': author_bio,
                            'keywords': extract_keywords(f"{headline} {content}", keyword_importance),
                        })
                    pbar.update(1)
                # Pause between pages to go easy on the site.
                time.sleep(2)
    finally:
        # Always shut the browser down, even when scraping aborts midway.
        driver.quit()

    return news_data
def run_preprocessor():
    """Scrape the news site and merge the results into the JSON archive in DATA_DIR."""
    output_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    save_to_json(scrape_oil_news(), output_path)
# Run the scrape-and-save pipeline only when this file is executed as a script.
if __name__ == "__main__":
    run_preprocessor()