xtesting-backend/merge_csvs.py at main · Aaravkataria24/xtesting-backend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd

# Account handles; each one is expected to have an export named
# tweets_<handle>.csv in the working directory.
usernames = [
    "TheCryptoLark",
    "JupiterExchange",
    "weremeow",
    "SOCKETProtocol",
    "litocoen",
    "3orovik",
    "aeyakovenko",
    "lrettig",
    "musalbas",
    "jon_charb",
    "avsa",
    "adamscochran",
    "koeppelmann",
    "0xCygaar",
    "cryptunez",
    "BullyEsq",
    "solana",
    "phantom",
    "ethereum",
]

# Search terms; each one is expected to have an export named
# tweets_search_query_<term>.csv. Terms are inserted into the file name
# verbatim, so the literal "%20" below is part of the expected file name.
search_queries = [
    "chain%20abstraction",
    "interop",
    "rollup",
    "solana",
    "trenches",
    "multi-chain",
    "dApp",
    "onchain",
    "web3",
    "defi",
    "nft",
    "gamefi",
    "socialfi",
    "dao",
    "wallet",
    "staking",
    "bridging",
    "L2",
]

# Load the previously merged output so this run can build on it.
# A missing file and a zero-byte file both mean "nothing merged yet", so each
# case falls back to an empty DataFrame instead of aborting the script.
try:
    existing_df = pd.read_csv("tweets_all_users.csv")
except FileNotFoundError:
    print("⚠️ No existing tweets_all_users.csv found. Creating new file.")
    existing_df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # read_csv raises this when the file exists but has no columns to parse.
    print("⚠️ Existing tweets_all_users.csv is empty. Creating new file.")
    existing_df = pd.DataFrame()
else:
    print(f"✅ Loaded existing tweets_all_users.csv with {len(existing_df)} rows")

# Gather every non-empty source CSV into all_dfs: per-user exports first, then
# per-search-query exports, in the order the names are listed.
all_dfs = []

source_files = [f"tweets_{handle}.csv" for handle in usernames]
source_files += [f"tweets_search_query_{term}.csv" for term in search_queries]

for csv_name in source_files:
    try:
        frame = pd.read_csv(csv_name)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing exports and zero-byte files are skipped, not fatal.
        print(f"⚠️ File not found or empty: {csv_name}, skipping...")
        continue

    if frame.empty:
        # The file parsed but contains no data rows.
        print(f"⚠️ Empty file: {csv_name}, skipping...")
        continue

    all_dfs.append(frame)
    print(f"✅ Loaded {csv_name} with {len(frame)} rows")

# Merge the freshly loaded frames into tweets_all_users.csv, keeping one row
# per distinct value of the 'content' column among the new data.
if all_dfs:
    new_df = pd.concat(all_dfs, ignore_index=True)

    # More than one source file can contain the same tweet. Collapse those
    # repeats up front (first occurrence wins) so that both branches below
    # write de-duplicated data, not only the branch that compares against
    # the existing file.
    if "content" in new_df.columns:
        new_df = new_df.drop_duplicates(subset="content")

    if not existing_df.empty:
        # Keep only the new rows whose content is not already in the old data.
        unique_new_tweets = new_df[~new_df['content'].isin(existing_df['content'])]
        print(f"\n📊 Found {len(unique_new_tweets)} new unique tweets to add")

        # Existing rows stay first; the new unique rows are appended after.
        combined_df = pd.concat([existing_df, unique_new_tweets], ignore_index=True)
    else:
        combined_df = new_df

    # Overwrite the merged output; the DataFrame index is not written.
    combined_df.to_csv("tweets_all_users.csv", index=False)
    print(f"\n✅ Combined CSV saved as tweets_all_users.csv with {len(combined_df)} rows")
else:
    print("❌ No new files loaded. Check filenames.")