oil scraper works
Binary file not shown.
@@ -0,0 +1,76 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+
+# URL for OilPrice.com homepage
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+
+# Set up the data directory
+DATA_DIR = os.path.join(os.getcwd(), "data")
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+def scrape_oil_news():
+    print("Scraping oil market news using Selenium...")
+
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
+
+    driver.get(OIL_NEWS_URL)
+
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
+    articles = soup.find_all('div', class_='categoryArticle')
+    news_data = []
+
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                'link': link,  # Assuming the link is already a full URL
+                'date': date
+            })
+
+    df = pd.DataFrame(news_data)
+    return df
+
+def run_scraper():
+    news_df = scrape_oil_news()
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    if not news_df.empty:
+        news_df.to_csv(file_path, index=False)
+        print(f"Oil news data saved to {file_path}")
+    else:
+        print("No data was scraped. The CSV file is empty.")
+
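The new module's entry point is run_scraper(). As a usage note, a minimal driver sketch follows; the scrapers/oil_news_scraper.py module path is taken from the comment removed in the diff below, while the __main__ guard and the geckodriver remark are illustrative assumptions, not part of this commit:

    # Hypothetical driver script; assumes the module lives at
    # scrapers/oil_news_scraper.py and that geckodriver is on PATH
    # so Selenium can launch Firefox.
    from scrapers.oil_news_scraper import run_scraper

    if __name__ == "__main__":
        run_scraper()  # writes data/oil_news.csv under the working directory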
@@ -1,6 +1,8 @@
-# scrapers/oil_news_scraper.py
 from selenium import webdriver
-import requests
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
 import pandas as pd
 import os
@@ -8,60 +10,67 @@ import os
 # URL for OilPrice.com homepage
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
 
-# Define the directory to store the scraped data
+# Set up the data directory
 DATA_DIR = os.path.join(os.getcwd(), "data")
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
-# Function to scrape news headlines from OilPrice.com
 def scrape_oil_news():
-    print("Scraping oil market news...")
+    print("Scraping oil market news using Selenium...")
 
-    # Send an HTTP request to the website
-    response = requests.get(OIL_NEWS_URL)
-    response.raise_for_status()
+    # Set up Selenium options
+    options = Options()
+    options.headless = True
+    driver = webdriver.Firefox(options=options)
 
-    # Print the HTML to see what we are working with
-    print(response.text[:1000])  # Print only the first 1000 characters for brevity
+    driver.get(OIL_NEWS_URL)
 
-    # Parse the HTML using BeautifulSoup
-    soup = BeautifulSoup(response.text, "html.parser")
+    # Wait until 'categoryArticle' elements load
+    try:
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+        )
+    except Exception as e:
+        print("Error: Content did not load properly.")
+        driver.quit()
+        return pd.DataFrame()
 
-    # Find all news article containers (class names updated)
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    driver.quit()
+
+    # Parse the articles
     articles = soup.find_all('div', class_='categoryArticle')
-
-    # List to store the scraped data
     news_data = []
 
-    # Loop through each article container
-    for article in articles:
-        # Extract the headline, date, and link
-        headline = article.find('a').get_text(strip=True) if article.find('a') else None
-        link = article.find('a')['href'] if article.find('a') else None
-        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None
-
-        # Only append valid data
+    print(f"Found {len(articles)} articles.")
+
+    for i, article in enumerate(articles):
+        # Extract the title, link, and date using the adjusted structure
+        headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
+        link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
+        date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
+
+        # Log each article's details for debugging
+        print(f"Article {i+1} - Headline: {headline}, Link: {link}, Date: {date}")
+
+        # Only add valid entries
         if headline and link and date:
             news_data.append({
                 'headline': headline,
-                'link': f"https://oilprice.com{link}",
+                'link': link,  # Assuming the link is already a full URL
                 'date': date
             })
 
     df = pd.DataFrame(news_data)
     return df
 
-# Function to run the scraper and save data
 def run_scraper():
-    # Scrape oil news
     news_df = scrape_oil_news()
-
-    # Define the file path for saving the data
     file_path = os.path.join(DATA_DIR, 'oil_news.csv')
 
-    # Save the DataFrame to a CSV file
     if not news_df.empty:
         news_df.to_csv(file_path, index=False)
         print(f"Oil news data saved to {file_path}")
     else:
         print("No data was scraped. The CSV file is empty.")
+
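One caveat on the committed code: options.headless = True only works on Selenium versions that still expose the boolean headless property, which was deprecated during the Selenium 4 series and later removed. A sketch of the equivalent setup for newer Selenium releases, using Firefox's own -headless flag via add_argument (this is an editor's suggestion, not part of the commit):

    # Sketch for newer Selenium 4 releases, where Options.headless is gone;
    # passing Firefox's -headless argument achieves the same effect.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)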
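A second observation: the except Exception as e branch never uses e, and catching bare Exception hides the actual failure. WebDriverWait.until raises TimeoutException when the condition never becomes true, so catching that specifically makes timeouts easier to diagnose. A sketch, with wait_for_articles being a hypothetical helper name:

    # Sketch: narrower exception handling for the wait; TimeoutException is
    # what WebDriverWait.until raises when the timeout expires.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    def wait_for_articles(driver, timeout=20):
        """Return True once 'categoryArticle' elements are present, False on timeout."""
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
            return True
        except TimeoutException:
            print("Error: Content did not load properly (wait timed out).")
            return False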