Added a new scraping mode (extended author-metadata extraction)

This commit is contained in:
klein panic
2024-10-31 02:40:16 -04:00
parent 1fcd98da06
commit fc7681ed68
11 changed files with 1938 additions and 2226 deletions

View File

@@ -0,0 +1,202 @@
import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm # Progress bar
# Scraper configuration: news source URL and filesystem layout.
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__))  # One level up from this script
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
# exist_ok avoids the race between the existence check and the creation call.
os.makedirs(DATA_DIR, exist_ok=True)
def load_existing_data(file_path):
    """Load previously scraped articles from *file_path*.

    Returns the parsed JSON list, or an empty list when the file does not
    exist or contains invalid JSON — a corrupt cache file should not abort
    an entire scraping run.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except json.JSONDecodeError:
            print(f"Warning: could not parse existing data in {file_path}; starting fresh.")
            return []
    return []
def save_to_json(data, file_path):
    """Merge *data* into the JSON file at *file_path*, skipping duplicates.

    An article is appended only when it carries a 'link' not already present
    in the existing file; the combined list is written back pretty-printed.
    """
    existing_data = load_existing_data(file_path)
    existing_links = {entry['link'] for entry in existing_data if 'link' in entry}

    fresh = []
    for entry in data:
        if 'link' not in entry or entry['link'] in existing_links:
            print(f"Skipping duplicate or missing link article: {entry.get('headline', 'Unknown Headline')}")
        else:
            fresh.append(entry)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data + fresh, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {file_path}")
def load_keyword_importance(file_path):
    """Parse 'keyword importance' pairs (one per line) into a dict.

    Keys are lower-cased keywords, values are integer importance scores.
    Malformed lines (wrong field count or a non-integer score) are skipped
    instead of aborting the load with a ValueError.
    """
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 2:
                    continue
                keyword, importance = parts
                try:
                    keyword_importance[keyword.lower()] = int(importance)
                except ValueError:
                    print(f"Skipping malformed keyword line: {line.strip()}")
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance
# Module-level keyword table, loaded once at import time and reused by
# extract_keywords() for every article.
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
def extract_keywords(text, keyword_importance):
    """Return up to 10 (keyword, importance) pairs found in *text*, most important first."""
    found = {}
    for word in re.findall(r'\b\w+\b', text.lower()):
        if word in keyword_importance:
            found[word] = keyword_importance[word]
    ranked = sorted(found.items(), key=lambda item: item[1], reverse=True)
    return ranked[:10]
def filter_content(content):
    """Remove advertisements, irrelevant phrases, headers, and disclaimers from content.

    Applies each boilerplate pattern case-insensitively, then collapses all
    runs of whitespace to single spaces. Note: the '^'-anchored patterns only
    match at the start of the whole string (no re.MULTILINE), preserving the
    original matching behaviour.
    """
    patterns = [
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        # Dots escaped: an unescaped '.' matched any character, so e.g.
        # "OilpriceXcom" would also have been stripped.
        r'More Top Reads From Oilprice\.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice\.com',
        r'\"It\'s most important 8 minute read of my week…\"',
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    ]
    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)
    # Collapse whitespace left behind by the removals.
    content = re.sub(r'\s+', ' ', content).strip()
    return content
def extract_author_info(driver, article_soup):
    """Extract the author name and bio via the article's 'More Info'/'Read More' link.

    Navigates *driver* to the author bio page when such a link exists (note:
    this leaves the driver on the bio page). Returns an (author, author_bio)
    tuple, with placeholder values when no link is found or loading fails.
    """
    author = "Unknown Author"
    author_bio = ""
    # bs4 deprecated the `text=` keyword in favour of `string=`.
    author_tag = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
    if author_tag:
        try:
            # NOTE(review): assumes href is an absolute URL — confirm against site markup.
            driver.get(author_tag['href'])
            # Generous wait to handle slow-loading author pages.
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
            )
            bio_soup = BeautifulSoup(driver.page_source, "html.parser")
            # Primary lookup for the author's name and bio paragraph.
            author_name_tag = bio_soup.find('h1')
            author_bio_tag = bio_soup.find('p')
            # Fallback selectors if the primary elements are missing.
            if not author_name_tag or not author_bio_tag:
                author_name_tag = bio_soup.find('span', class_='author-name')  # Hypothetical class for author name
                author_bio_tag = bio_soup.find('div', class_='bio-content')  # Hypothetical class for bio content
            author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
            author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
        except Exception as e:
            print(f"Author bio page failed to load or extract. Error: {e}")
    return author, author_bio
def scrape_oil_news():
    """Scrape oil news articles (headline, link, date, content, author, keywords).

    Uses a headless Firefox driver. A first pass counts articles so tqdm can
    show an accurate total; a second pass fetches each article page.
    Returns a list of article dicts.
    """
    print("Scraping oil news articles for sentiment analysis...")
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 1

    # First pass: count articles for the progress bar total.
    total_articles = 0
    while page_number <= max_pages:
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:  # narrow from bare except: still stops paging on timeout
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_articles += len(soup.find_all('div', class_='categoryArticle'))
        page_number += 1

    page_number = 1
    with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
        while page_number <= max_pages:
            print(f"\nProcessing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                break
            for article in articles:
                title_tag = article.find('h2', class_='categoryArticle__title')
                headline = title_tag.get_text(strip=True) if title_tag else None
                link_tag = article.find('a', href=True)
                link = link_tag['href'] if link_tag else None
                date_meta = article.find('p', class_='categoryArticle__meta')
                date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None

                # Defaults up front: previously `author`/`author_bio` were only
                # bound inside the try block, raising NameError when the link
                # was missing or the article page failed to load.
                content = ""
                author, author_bio = "Unknown Author", ""

                if link:
                    print(f"Fetching article: {link}")
                    driver.get(link)
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
                        )
                        article_soup = BeautifulSoup(driver.page_source, "html.parser")
                        raw_content = " ".join(p.get_text(strip=True) for p in article_soup.find_all('p'))
                        content = filter_content(raw_content)
                        author, author_bio = extract_author_info(driver, article_soup)
                    except Exception:
                        print(f"Error: Content did not load for article {headline}.")

                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
                if headline and link and date:
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
                        'author': author,
                        'author_bio': author_bio,
                        'keywords': extracted_keywords,
                    })
                # Guard the slice: headline may be None for malformed cards.
                pbar.set_postfix_str(f"Processing article: {(headline or '')[:40]}...")
                pbar.update(1)
            page_number += 1
            time.sleep(2)

    driver.quit()
    return news_data
def run_preprocessor():
    """Entry point: scrape the latest oil news and persist it as JSON."""
    output_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    save_to_json(scrape_oil_news(), output_path)


if __name__ == "__main__":
    run_preprocessor()

View File

@@ -0,0 +1,231 @@
import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm # Progress bar
# Scraper configuration: news source URL and filesystem layout.
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__))  # One level up from this script
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
# exist_ok avoids the race between the existence check and the creation call.
os.makedirs(DATA_DIR, exist_ok=True)
def load_existing_data(file_path):
    """Load previously scraped articles from *file_path*.

    Returns the parsed JSON list, or an empty list when the file does not
    exist or contains invalid JSON — a corrupt cache file should not abort
    an entire scraping run.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except json.JSONDecodeError:
            print(f"Warning: could not parse existing data in {file_path}; starting fresh.")
            return []
    return []
def save_to_json(data, file_path):
    """Merge *data* into the JSON file at *file_path*, skipping duplicates.

    An article is appended only when it carries a 'link' not already present
    in the existing file; the combined list is written back pretty-printed.
    """
    existing_data = load_existing_data(file_path)
    existing_links = {entry['link'] for entry in existing_data if 'link' in entry}

    fresh = []
    for entry in data:
        if 'link' not in entry or entry['link'] in existing_links:
            print(f"Skipping duplicate or missing link article: {entry.get('headline', 'Unknown Headline')}")
        else:
            fresh.append(entry)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data + fresh, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {file_path}")
def load_keyword_importance(file_path):
    """Parse 'keyword importance' pairs (one per line) into a dict.

    Keys are lower-cased keywords, values are integer importance scores.
    Malformed lines (wrong field count or a non-integer score) are skipped
    instead of aborting the load with a ValueError.
    """
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 2:
                    continue
                keyword, importance = parts
                try:
                    keyword_importance[keyword.lower()] = int(importance)
                except ValueError:
                    print(f"Skipping malformed keyword line: {line.strip()}")
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance
# Module-level keyword table, loaded once at import time and reused by
# extract_keywords() for every article.
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
def extract_keywords(text, keyword_importance):
    """Return up to 10 (keyword, importance) pairs found in *text*, most important first."""
    found = {}
    for word in re.findall(r'\b\w+\b', text.lower()):
        if word in keyword_importance:
            found[word] = keyword_importance[word]
    ranked = sorted(found.items(), key=lambda item: item[1], reverse=True)
    return ranked[:10]
def filter_content(content):
    """Remove advertisements, irrelevant phrases, headers, and disclaimers from content.

    Applies each boilerplate pattern case-insensitively, then collapses all
    runs of whitespace to single spaces. Note: the '^'-anchored patterns only
    match at the start of the whole string (no re.MULTILINE), preserving the
    original matching behaviour.
    """
    patterns = [
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        # Dots escaped: an unescaped '.' matched any character, so e.g.
        # "OilpriceXcom" would also have been stripped.
        r'More Top Reads From Oilprice\.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice\.com',
        r'\"It\'s most important 8 minute read of my week…\"',
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    ]
    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)
    # Collapse whitespace left behind by the removals.
    content = re.sub(r'\s+', ' ', content).strip()
    return content
def extract_author_info(driver, article_soup, headline_pages=1):
    """Extract author details via the article's 'More Info'/'Read More' link.

    Returns a dict with keys: name, bio, contributor_since, other_articles.
    *headline_pages* limits how many pages of the author's article index are
    crawled for recent headlines. Retries the bio page up to 3 times and
    falls back to placeholder values when no link exists or loading keeps
    failing. Note: navigating leaves *driver* on the author's pages.
    """
    author = "Unknown Author"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    # bs4 deprecated the `text=` keyword in favour of `string=`.
    author_tag = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
    if author_tag:
        retries = 3  # Retry limit for flaky page loads
        for attempt in range(retries):
            try:
                # NOTE(review): assumes href is an absolute URL — confirm against site markup.
                driver.get(author_tag['href'])
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
                )
                bio_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Author's display name.
                author_name_tag = bio_soup.find('h1')
                author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

                # Author's bio description.
                author_bio_tag = bio_soup.find('p')
                author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

                # "Contributor since" date, when present.
                contributor_since_tag = bio_soup.find(string=re.compile(r"Contributor since", re.IGNORECASE))
                if contributor_since_tag:
                    contributor_since = contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")

                # Headlines of the author's latest articles, limited by headline_pages.
                for page in range(1, headline_pages + 1):
                    driver.get(f"{author_tag['href']}Page-{page}.html")
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                    )
                    page_soup = BeautifulSoup(driver.page_source, "html.parser")
                    for title_tag in page_soup.find_all('h2', class_='categoryArticle__title'):
                        other_articles.append(title_tag.get_text(strip=True))
                break  # Success: stop retrying
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
                time.sleep(2)  # Back off before retrying
                if attempt == retries - 1:
                    print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")

    return {
        "name": author,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles,
    }
def scrape_oil_news():
    """Scrape oil news articles plus extended author metadata.

    Uses a headless Firefox driver. A first pass counts articles so tqdm can
    show an accurate total; a second pass fetches each article page. Returns
    a list of dicts with headline, link, content, date, author name/bio,
    contributor-since date, other article headlines, and keywords.
    """
    print("Scraping oil news articles for sentiment analysis...")
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 1

    # First pass: count articles for the progress bar total.
    total_articles = 0
    while page_number <= max_pages:
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:  # narrow from bare except: still stops paging on timeout
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_articles += len(soup.find_all('div', class_='categoryArticle'))
        page_number += 1

    page_number = 1
    with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
        while page_number <= max_pages:
            print(f"\nProcessing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                break
            for article in articles:
                title_tag = article.find('h2', class_='categoryArticle__title')
                headline = title_tag.get_text(strip=True) if title_tag else None
                link_tag = article.find('a', href=True)
                link = link_tag['href'] if link_tag else None
                date_meta = article.find('p', class_='categoryArticle__meta')
                date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None

                # Defaults up front so a skipped or failed fetch cannot leave
                # these names unbound. The original tuple-unpacked the dict
                # returned by extract_author_info() (a ValueError) and then
                # called it a second time outside the try block, where
                # article_soup could be unbound; call it exactly once here.
                content = ""
                author_info = {
                    "name": "Unknown Author",
                    "bio": "",
                    "contributor_since": "",
                    "other_articles": [],
                }

                if link:
                    print(f"Fetching article: {link}")
                    driver.get(link)
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
                        )
                        article_soup = BeautifulSoup(driver.page_source, "html.parser")
                        raw_content = " ".join(p.get_text(strip=True) for p in article_soup.find_all('p'))
                        content = filter_content(raw_content)
                        author_info = extract_author_info(driver, article_soup, headline_pages=1)
                    except Exception:
                        print(f"Error: Content did not load for article {headline}.")

                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
                if headline and link and date:
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
                        'author': author_info['name'],
                        'author_bio': author_info['bio'],
                        'contributor_since': author_info['contributor_since'],
                        'other_articles': author_info['other_articles'],
                        'keywords': extracted_keywords,
                    })
                # Guard the slice: headline may be None for malformed cards.
                pbar.set_postfix_str(f"Processing article: {(headline or '')[:40]}...")
                pbar.update(1)
            page_number += 1
            time.sleep(2)

    driver.quit()
    return news_data
def run_preprocessor():
    """Entry point: scrape the latest oil news and persist it as JSON."""
    output_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    save_to_json(scrape_oil_news(), output_path)


if __name__ == "__main__":
    run_preprocessor()

View File

@@ -11,6 +11,8 @@ import re
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
DATA_DIR = os.path.join(os.getcwd(), "data")
KEYWORD_FILE_PATH = os.path.join(os.getcwd(), "assets", "oil_key_words.txt")
if not os.path.exists(DATA_DIR):
os.makedirs(DATA_DIR)
@@ -39,15 +41,37 @@ def save_to_json(data, file_path):
json.dump(combined_data, f, ensure_ascii=False, indent=4)
print(f"Oil news data saved to {file_path}")
def extract_keywords(text):
"""Improved placeholder function to extract keywords from text."""
def load_keyword_importance(file_path):
    """Load keyword importance values from the oil_key_words.txt file.

    Keys are lower-cased keywords, values are integer importance scores.
    Malformed lines (wrong field count or a non-integer score) are skipped
    instead of aborting the load with a ValueError.
    """
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 2:
                    continue
                keyword, importance = parts
                try:
                    keyword_importance[keyword.lower()] = int(importance)
                except ValueError:
                    print(f"Skipping malformed keyword line: {line.strip()}")
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
def extract_keywords(text, keyword_importance):
"""Extract important keywords from text based on an external keyword list."""
words = re.findall(r'\b\w+\b', text.lower())
keywords = [word for word in words if len(word) > 3] # Example filter: words longer than 3 chars
return list(set(keywords))[:10] # Return up to 10 unique keywords
keywords = {}
for word in words:
if len(word) > 3 and word in keyword_importance:
keywords[word] = keyword_importance[word] # Store keyword with its importance
# Return up to 10 unique keywords with their importance
return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
def analyze_sentiment(text):
"""Placeholder function for sentiment analysis."""
# Basic placeholder logic (to be replaced with actual sentiment analysis)
"""Basic sentiment analysis placeholder with minimal processing."""
# Only check for specific keywords; avoid complex logic to save time
if "profit" in text or "rise" in text:
return "Positive"
elif "loss" in text or "decline" in text:
@@ -67,6 +91,7 @@ def scrape_oil_news():
max_pages = 10 # Limit to 10 pages
while page_number <= max_pages:
print(f"Processing page {page_number}...")
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
try:
@@ -91,7 +116,8 @@ def scrape_oil_news():
excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
timestamp = date.split('|')[0].strip() if '|' in date else date
extracted_keywords = extract_keywords(headline + " " + excerpt if excerpt else headline, keyword_importance)
if headline and link and date:
news_data.append({
'headline': headline,
@@ -99,8 +125,9 @@ def scrape_oil_news():
'date': timestamp,
'author': author,
'excerpt': excerpt,
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
'keywords': extracted_keywords,
'sentiment_analysis': None
#'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
})
page_number += 1