Added data-collection
Data-Collection/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
+venv/
Data-Collection/WebScraper/README.md (new file, empty)
Data-Collection/WebScraper/data/oil_news.csv (new file, 1 line, content not shown)
Data-Collection/WebScraper/main.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# main.py
+import scrapers.oil_news_scraper as oil_news
+
+def main():
+    print("Starting oil data collection...")
+
+    # Run oil market news scraper
+    oil_news.run_scraper()
+
+    print("Oil news data scraping completed.")
+
+if __name__ == "__main__":
+    main()
+
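For `import scrapers.oil_news_scraper` to resolve, main.py has to be run from Data-Collection/WebScraper with the scrapers/ directory alongside it (on Python 3, scrapers/ can resolve as a namespace package even without an `__init__.py`). A minimal sketch of a launcher that checks this assumption first; the file name and check are hypothetical, not part of the commit:

# run_check.py -- hypothetical helper, not part of this commit. Verifies the
# working directory main.py assumes before starting the scraper.
import os
import sys

if not os.path.isdir("scrapers"):
    sys.exit("Run from Data-Collection/WebScraper: scrapers/ not found.")

import main  # importing main resolves scrapers.oil_news_scraper

main.main()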
Binary file not shown.
Data-Collection/WebScraper/scrapers/oil_news_scraper.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+# scrapers/oil_news_scraper.py
+
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+
+# URL for OilPrice.com's latest world energy news page
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+
+# Define the directory to store the scraped data
+DATA_DIR = os.path.join(os.getcwd(), "data")
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+# Function to scrape news headlines from OilPrice.com
+def scrape_oil_news():
+    print("Scraping oil market news...")
+
+    # Send an HTTP request to the website
+    response = requests.get(OIL_NEWS_URL)
+    response.raise_for_status()
+
+    # Parse the HTML using BeautifulSoup
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Find all news article containers
+    articles = soup.find_all('div', class_='categoryArticle')
+
+    # List to store the scraped data
+    news_data = []
+
+    # Loop through each article container
+    for article in articles:
+        # Extract the headline, date, and link
+        headline = article.find('a').get_text(strip=True) if article.find('a') else None
+        link = article.find('a')['href'] if article.find('a') else None
+        date = article.find('span', class_='categoryArticle__date').get_text(strip=True) if article.find('span', class_='categoryArticle__date') else None
+
+        # Only append valid data
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                'link': f"https://oilprice.com{link}",
+                'date': date
+            })
+
+    # Convert the list into a pandas DataFrame
+    df = pd.DataFrame(news_data)
+
+    return df
+
+# Function to run the scraper and save data
+def run_scraper():
+    # Scrape oil news
+    news_df = scrape_oil_news()
+
+    # Define the file path for saving the data
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    # Save the DataFrame to a CSV file
+    if not news_df.empty:
+        news_df.to_csv(file_path, index=False)
+        print(f"Oil news data saved to {file_path}")
+    else:
+        print("No data was scraped. The CSV file is empty.")
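Since scrape_oil_news() depends entirely on OilPrice.com's current markup, the selector logic can be sanity-checked offline. A small sketch; the HTML below is a hand-written stand-in for one categoryArticle block, not captured from the live site:

# Hypothetical offline check, not part of this commit: exercises the same
# selectors scrape_oil_news() uses, without hitting the network.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div class="categoryArticle">
  <a href="/Latest-Energy-News/World-News/Example-Headline.html">Example headline</a>
  <span class="categoryArticle__date">Oct 01, 2024</span>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
article = soup.find("div", class_="categoryArticle")
assert article.find("a").get_text(strip=True) == "Example headline"
assert article.find("a")["href"].startswith("/Latest-Energy-News/")
assert article.find("span", class_="categoryArticle__date").get_text(strip=True) == "Oct 01, 2024"
print("Selector check passed.")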
Data-Collection/WebScraper/setup.py (new file, empty)
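setup.py is committed empty. If it were filled in later, a minimal sketch based only on the imports in this commit might look like the following; the package name and version are placeholders, not from the source:

# setup.py -- hypothetical sketch, not the committed (empty) file.
from setuptools import find_packages, setup

setup(
    name="webscraper",   # placeholder name
    version="0.1.0",     # placeholder version
    packages=find_packages(),
    install_requires=[
        "requests",
        "beautifulsoup4",
        "pandas",
    ],
)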