Added data-collection

This commit is contained in:
klein panic
2024-10-22 14:47:39 -04:00
parent 64bd498052
commit b2bab7c5a7
7 changed files with 82 additions and 0 deletions

1
Data-Collection/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
venv/

View File

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,14 @@
# main.py
import scrapers.oil_news_scraper as oil_news


def main():
    """Entry point: run the oil-market news scraper end to end."""
    print("Starting oil data collection...")
    # Delegate the actual scraping + CSV persistence to the scraper module.
    oil_news.run_scraper()
    print("Oil news data scraping completed.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,66 @@
# scrapers/oil_news_scraper.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
# URL for the OilPrice.com "Latest Energy News" listing page.
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"

# Directory where scraped data is written; created at import time if absent.
DATA_DIR = os.path.join(os.getcwd(), "data")
# exist_ok=True avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(DATA_DIR, exist_ok=True)
# Function to scrape news headlines from OilPrice.com
def scrape_oil_news():
    """Scrape headline/link/date rows from the OilPrice.com news listing.

    Returns:
        pandas.DataFrame: columns 'headline', 'link' (absolute URL) and
        'date'; empty if no complete article containers were found.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    print("Scraping oil market news...")
    # A timeout is essential: without one requests.get() can block forever
    # on a stalled connection.
    response = requests.get(OIL_NEWS_URL, timeout=10)
    response.raise_for_status()

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    # Each article lives in a <div class="categoryArticle"> container.
    articles = soup.find_all('div', class_='categoryArticle')

    news_data = []
    for article in articles:
        # Query each child tag once instead of re-running find() per field.
        anchor = article.find('a')
        date_tag = article.find('span', class_='categoryArticle__date')
        headline = anchor.get_text(strip=True) if anchor else None
        link = anchor['href'] if anchor else None
        date = date_tag.get_text(strip=True) if date_tag else None
        # Skip partially populated containers (ads, layout divs, etc.).
        if headline and link and date:
            news_data.append({
                'headline': headline,
                'link': f"https://oilprice.com{link}",
                'date': date,
            })

    # Convert the accumulated rows into a pandas DataFrame.
    return pd.DataFrame(news_data)
# Function to run the scraper and save data
def run_scraper():
    """Scrape oil-market news and persist it to DATA_DIR/oil_news.csv."""
    frame = scrape_oil_news()
    target = os.path.join(DATA_DIR, 'oil_news.csv')

    # Guard clause: nothing scraped -> leave the filesystem untouched.
    if frame.empty:
        print("No data was scraped. The CSV file is empty.")
        return

    frame.to_csv(target, index=False)
    print(f"Oil news data saved to {target}")

View File