# main.py
import scrapers.oil_news_scraper as oil_news


def main():
    """Entry point: run the oil market news scraper end to end."""
    print("Starting oil data collection...")

    # Delegate the actual scraping + CSV persistence to the scraper module.
    oil_news.run_scraper()

    print("Oil news data scraping completed.")


if __name__ == "__main__":
    main()
# Function to scrape news headlines from OilPrice.com
def scrape_oil_news():
    """Scrape headline, link, and date for each article on the
    OilPrice.com latest-news page.

    Returns:
        pd.DataFrame: one row per article with columns 'headline',
        'link', 'date'; empty DataFrame if nothing could be parsed.

    Raises:
        requests.HTTPError: if the HTTP request returns an error status.
        requests.Timeout: if the site does not respond within 30s.
    """
    print("Scraping oil market news...")

    # Timeout prevents the scraper from hanging forever on a stalled
    # connection (the original call had no timeout at all).
    response = requests.get(OIL_NEWS_URL, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Each article lives in a div.categoryArticle container
    # (NOTE(review): class name is site-specific — re-verify if the
    # site layout changes).
    news_data = []
    for article in soup.find_all('div', class_='categoryArticle'):
        # Look the anchor up once instead of three separate find() calls.
        anchor = article.find('a')
        date_tag = article.find('span', class_='categoryArticle__date')

        headline = anchor.get_text(strip=True) if anchor else None
        link = anchor['href'] if anchor and anchor.has_attr('href') else None
        date = date_tag.get_text(strip=True) if date_tag else None

        # Skip partially-parsed articles rather than emit null fields.
        if headline and link and date:
            # hrefs may already be absolute; only prefix relative ones
            # (the original prefixed unconditionally, which corrupts
            # already-absolute URLs into "https://oilprice.comhttps://...").
            if not link.startswith('http'):
                link = f"https://oilprice.com{link}"
            news_data.append({
                'headline': headline,
                'link': link,
                'date': date,
            })

    return pd.DataFrame(news_data)


# Function to run the scraper and save data
def run_scraper():
    """Run the news scraper and persist the result to data/oil_news.csv.

    Writes nothing (and says so on stdout) when no articles were scraped,
    so an empty run never clobbers a previously saved CSV.
    """
    news_df = scrape_oil_news()

    file_path = os.path.join(DATA_DIR, 'oil_news.csv')

    if not news_df.empty:
        news_df.to_csv(file_path, index=False)
        print(f"Oil news data saved to {file_path}")
    else:
        print("No data was scraped. The CSV file is empty.")