Added data-collection

This commit is contained in:
klein panic
2024-10-22 14:47:39 -04:00
parent 64bd498052
commit b2bab7c5a7
7 changed files with 82 additions and 0 deletions

1
Data-Collection/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
venv/

View File

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,14 @@
# main.py
import scrapers.oil_news_scraper as oil_news


def main():
    """Entry point: run the oil-market news scraper end to end."""
    print("Starting oil data collection...")
    # Delegate the actual scraping + CSV persistence to the scraper module.
    oil_news.run_scraper()
    print("Oil news data scraping completed.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,66 @@
# scrapers/oil_news_scraper.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
# URL for the OilPrice.com "Latest Energy News" listing page.
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Directory where scraped data is written; created at import time if absent.
DATA_DIR = os.path.join(os.getcwd(), "data")
# exist_ok=True avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(DATA_DIR, exist_ok=True)
# Function to scrape news headlines from OilPrice.com
def scrape_oil_news():
    """Scrape headline/link/date rows from the OilPrice.com news listing.

    Returns:
        pandas.DataFrame: columns 'headline', 'link' (absolute URL) and
        'date'; empty if no complete article containers were found.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    print("Scraping oil market news...")
    # A timeout is essential: without one requests.get() can block forever
    # on a stalled connection.
    response = requests.get(OIL_NEWS_URL, timeout=10)
    response.raise_for_status()

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    # Each article lives in a <div class="categoryArticle"> container.
    articles = soup.find_all('div', class_='categoryArticle')

    news_data = []
    for article in articles:
        # Query each child tag once instead of re-running find() per field.
        anchor = article.find('a')
        date_tag = article.find('span', class_='categoryArticle__date')
        headline = anchor.get_text(strip=True) if anchor else None
        link = anchor['href'] if anchor else None
        date = date_tag.get_text(strip=True) if date_tag else None
        # Skip partially populated containers (ads, layout divs, etc.).
        if headline and link and date:
            news_data.append({
                'headline': headline,
                'link': f"https://oilprice.com{link}",
                'date': date,
            })

    # Convert the accumulated rows into a pandas DataFrame.
    return pd.DataFrame(news_data)
# Function to run the scraper and save data
def run_scraper():
    """Scrape oil-market news and persist it to DATA_DIR/oil_news.csv."""
    frame = scrape_oil_news()
    target = os.path.join(DATA_DIR, 'oil_news.csv')

    # Guard clause: nothing scraped -> leave the filesystem untouched.
    if frame.empty:
        print("No data was scraped. The CSV file is empty.")
        return

    frame.to_csv(target, index=False)
    print(f"Oil news data saved to {target}")

View File