Switched to JSON; added keyword extraction, author extraction, excerpt extraction, and timestamp data; added a placeholder for future rudimentary sentiment analysis. Increased the number of articles used, added a cap on the articles analyzed to save CPU, and added a function to avoid re-collecting data for the same article.

This commit is contained in:
klein panic
2024-10-31 00:04:22 -04:00
parent 3ae788ed9b
commit e638bea1da
3 changed files with 142 additions and 94 deletions
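The commit message leaves sentiment analysis as a future placeholder ('sentiment_analysis': None in the diff below). A minimal lexicon-based sketch of what could eventually fill that field is shown here; the word lists, function name, and scoring rule are assumptions for illustration only, not part of this commit.

import re

# Tiny, assumed word lists; a real version would use a proper sentiment lexicon or model.
POSITIVE_WORDS = {"gain", "gains", "rise", "rises", "surge", "boost", "record", "growth"}
NEGATIVE_WORDS = {"fall", "falls", "drop", "drops", "slump", "cut", "crisis", "glut"}

def rudimentary_sentiment(text):
    """Return 'positive', 'negative', or 'neutral' based on simple word counts."""
    words = re.findall(r'\b\w+\b', text.lower())
    score = sum(w in POSITIVE_WORDS for w in words) - sum(w in NEGATIVE_WORDS for w in words)
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"

# Could later replace the None placeholder, e.g.:
# article['sentiment_analysis'] = rudimentary_sentiment(headline + " " + (excerpt or ""))
print(rudimentary_sentiment("Oil prices surge to a new record"))  # positive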

View File

@@ -1,76 +1,100 @@
import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import re
# URL for OilPrice.com homepage
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
# Set up the data directory
DATA_DIR = os.path.join(os.getcwd(), "data")
if not os.path.exists(DATA_DIR):
os.makedirs(DATA_DIR)
def load_existing_data(file_path):
"""Load existing data from JSON file to avoid duplicates."""
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
return []
def save_to_json(data, file_path):
"""Save scraped data to a JSON file, ensuring no duplicates."""
existing_data = load_existing_data(file_path)
existing_links = {article['link'] for article in existing_data}
new_data = [article for article in data if article['link'] not in existing_links]
combined_data = existing_data + new_data
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(combined_data, f, ensure_ascii=False, indent=4)
print(f"Oil news data saved to {file_path}")
def extract_keywords(text):
"""Simple function to extract keywords from text."""
keywords = re.findall(r'\b\w+\b', text.lower())
    return list(set(keywords))[:10]  # Return up to 10 unique keywords (set order is arbitrary)
def scrape_oil_news():
print("Scraping oil market news using Selenium...")
# Set up Selenium options
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get(OIL_NEWS_URL)
# Wait until 'categoryArticle' elements load
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
)
except Exception as e:
print("Error: Content did not load properly.")
driver.quit()
        return []
soup = BeautifulSoup(driver.page_source, "html.parser")
# Parse the articles
articles = soup.find_all('div', class_='categoryArticle')
news_data = []
page_number = 1
max_pages = 10 # Limit to 10 pages
print(f"Found {len(articles)} articles.")
while page_number <= max_pages:
# Load the page with pagination
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
)
except Exception as e:
print(f"Error: Content did not load properly on page {page_number}.")
break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = soup.find_all('div', class_='categoryArticle')
        if not articles:
            print(f"No articles found on page {page_number}. Ending pagination.")
            break
for article in articles:
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
            author = date.split('|')[-1].strip() if date and '|' in date else "Unknown Author"
            timestamp = date.split('|')[0].strip() if date and '|' in date else date
if headline and link and date:
news_data.append({
'headline': headline,
'link': link,
'date': timestamp,
'author': author,
'excerpt': excerpt,
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
'sentiment_analysis': None # Placeholder for future sentiment analysis
})
        page_number += 1
        time.sleep(2)

    driver.quit()
    return news_data
def run_scraper():
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)
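A quick sketch of the de-duplication behavior that save_to_json provides: running the same logic twice with an overlapping 'link' does not store the article again. The file path and sample records below are made up for illustration.

import json
import os

# Hypothetical path and records, purely for illustration.
path = os.path.join("data", "oil_news_demo.json")
os.makedirs("data", exist_ok=True)

# Pretend a previous run already stored one article.
with open(path, "w", encoding="utf-8") as f:
    json.dump([{"headline": "Old story", "link": "https://example.com/a"}], f)

# A new scrape returns one duplicate link and one new one.
scraped = [
    {"headline": "Old story", "link": "https://example.com/a"},
    {"headline": "New story", "link": "https://example.com/b"},
]

# Same rule as save_to_json: only links not already on disk are appended.
with open(path, "r", encoding="utf-8") as f:
    existing = json.load(f)
existing_links = {article["link"] for article in existing}
combined = existing + [a for a in scraped if a["link"] not in existing_links]

print(len(combined))  # 2 -- the duplicate link is not stored twice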
