Switched output to JSON; added keyword extraction, author extraction, excerpt extraction, and timestamp data, plus a placeholder for future rudimentary sentiment analysis. Increased the number of articles collected, added a cap on the number of pages analyzed to save CPU time, and added a function to avoid collecting the same article twice.
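
For reference, each article saved to the JSON file now carries the fields built in scrape_oil_news below; the record here is purely illustrative, with made-up values:

example_article = {
    'headline': "Oil Prices Climb on Supply Concerns",  # hypothetical
    'link': "https://oilprice.com/Latest-Energy-News/World-News/example-article.html",  # hypothetical
    'date': "Jan 01, 2025 at 10:00",
    'author': "Example Author",
    'excerpt': "A short illustrative summary of the article...",
    'keywords': ['oil', 'prices', 'climb', 'supply', 'concerns'],
    'sentiment_analysis': None  # placeholder until sentiment analysis is added
}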
@@ -1,76 +1,100 @@
import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import re

# URL for the OilPrice.com latest energy news listing
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Set up the data directory
DATA_DIR = os.path.join(os.getcwd(), "data")
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def load_existing_data(file_path):
    """Load existing data from JSON file to avoid duplicates."""
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []

def save_to_json(data, file_path):
    """Save scraped data to a JSON file, ensuring no duplicates."""
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data}

    new_data = [article for article in data if article['link'] not in existing_links]
    combined_data = existing_data + new_data

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Oil news data saved to {file_path}")

def extract_keywords(text):
    """Simple function to extract keywords from text."""
    keywords = re.findall(r'\b\w+\b', text.lower())
    return list(set(keywords))[:10]  # Return the first 10 unique keywords

def scrape_oil_news():
    print("Scraping oil market news using Selenium...")

    # Set up Selenium options
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 10  # Limit to 10 pages to cap the number of articles analyzed

    while page_number <= max_pages:
        # Load the page with pagination
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")

        # Wait until 'categoryArticle' elements load
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception as e:
            print(f"Error: Content did not load properly on page {page_number}.")
            break

        # Parse the articles on this page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = soup.find_all('div', class_='categoryArticle')
        if not articles:
            print(f"No articles found on page {page_number}. Ending pagination.")
            break

        print(f"Found {len(articles)} articles on page {page_number}.")
        for i, article in enumerate(articles):
            # Extract the title, link, date, excerpt, and author using the adjusted structure
            headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
            link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
            date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
            excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
            author = date.split('|')[-1].strip() if date and '|' in date else "Unknown Author"
            timestamp = date.split('|')[0].strip() if date and '|' in date else date

            # Log each article's details for debugging
            print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")

            # Only add valid entries
            if headline and link and date:
                news_data.append({
                    'headline': headline,
                    'link': link,  # Assuming the link is already a full URL
                    'date': timestamp,
                    'author': author,
                    'excerpt': excerpt,
                    'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
                    'sentiment_analysis': None  # Placeholder for future sentiment analysis
                })

        page_number += 1
        time.sleep(2)

    driver.quit()
    return news_data

def run_scraper():
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)
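
The module defines no entry point of its own; a minimal, assumed wiring for running it directly would look like this (the __main__ guard below is not part of the commit):

if __name__ == "__main__":
    # Scrape OilPrice.com and append any new articles to data/oil_news.json
    run_scraper()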
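
The sentiment_analysis field is written as None for now. If the rudimentary sentiment analysis mentioned in the commit message is added later, one possible shape is a simple lexicon-based scorer like the sketch below; this is an illustration only, and the word lists and function name are assumptions, not part of this commit:

import re

# Hypothetical word lists; a real implementation would use a proper lexicon or model.
POSITIVE_WORDS = {"gain", "gains", "rise", "rises", "surge", "boost", "growth", "record"}
NEGATIVE_WORDS = {"fall", "falls", "drop", "drops", "cut", "cuts", "slump", "decline"}

def rudimentary_sentiment(text):
    """Return 'positive', 'negative', or 'neutral' based on simple word counts."""
    words = set(re.findall(r'\b\w+\b', text.lower()))
    score = len(words & POSITIVE_WORDS) - len(words & NEGATIVE_WORDS)
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"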