# sentiment_data_sets/step-01-process_tweets.py at master · seandearnaley/sentiment_data_sets · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
This script reads the airline tweets sentiment dataset (linked below), converts
each tweet into a (sentence, JSON label) pair, and saves the pairs to a new CSV
file.

https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
"""

import csv
import json


def process_file(file_path):
    """Convert a labelled tweets CSV into ``(sentence, JSON string)`` pairs.

    The file must have a header row containing the columns ``text``,
    ``airline_sentiment`` and ``airline_sentiment_confidence``.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one ``(sentence, json_text)`` tuple per data row, in file
        order. ``json_text`` is a JSON object with the keys ``reasoning``
        (a fixed sentence naming the label), ``sentiment`` (1.0, 0.0 or -1.0)
        and ``confidence`` (the confidence column rounded to two decimals).

    Raises:
        KeyError: If a required column is missing or the sentiment label is
            not one of ``positive``, ``neutral`` or ``negative``.
        ValueError: If the confidence value cannot be parsed as a float.
    """
    output_data = []
    sentiment_mapping = {"positive": 1.0, "neutral": 0.0, "negative": -1.0}

    # newline="" is what the csv module requires for its input files: without
    # it, universal-newline translation rewrites "\r\n" / "\r" inside quoted
    # tweet text to "\n" before the parser ever sees it.
    with open(file_path, "r", newline="", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            sentence = row["text"]
            sentiment = row["airline_sentiment"]
            confidence = float(row["airline_sentiment_confidence"])
            numeric_sentiment = sentiment_mapping[sentiment]
            reasoning = (
                f"The sentiment is {sentiment} based on the content of the tweet."
            )
            output_item = {
                "reasoning": reasoning,
                "sentiment": round(numeric_sentiment, 2),
                "confidence": round(confidence, 2),
            }
            # The label is stored as JSON text so it fits in one CSV cell.
            output_data.append((sentence, json.dumps(output_item)))

    return output_data


# Location of the source tweets CSV.
file_path = "data/inputs/airline_tweaks/Tweets.csv"

# Turn every row of the source file into a (sentence, JSON string) pair.
output_data = process_file(file_path)

# Write a header line followed by the processed pairs to the output CSV.
output_file = "data/outputs/Processed_Tweets_output.csv"
with open(output_file, mode="w", newline="", encoding="utf-8") as out_handle:
    csv_out = csv.writer(out_handle)
    csv_out.writerows([["Sentence", "JSON"], *output_data])