# Instagram-Wrapped/helper_function.py at main · AdamJeddy/Instagram-Wrapped · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import emoji
import unicodedata
from unidecode import unidecode
from colorama import Fore, Style
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

### ~~~~~~~~~~~ Functions ~~~~~~~~ ###

# Function to load JSON data from a file
def load_json(file_path):
    """Read the UTF-8 text file at *file_path* and return its parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Function to extract messages from the JSON data
def extract_messages(data):
    """Return the value stored under the 'messages' key of *data*.

    A new empty list is returned when the key is absent.
    """
    return data.get('messages', [])

# Function to repair mojibake (UTF-8 text that was mis-decoded as Latin-1) so emoji display correctly
def text_to_emoji(text):
    """Re-interpret *text* by encoding it as Latin-1 and decoding those bytes as UTF-8.

    Raises UnicodeEncodeError if *text* has characters outside Latin-1 and
    UnicodeDecodeError if the resulting bytes are not valid UTF-8.
    """
    latin1_bytes = text.encode('latin1')
    return latin1_bytes.decode('utf-8')

# Function to fix encoding issues
def fix_encoding(text):
    """Repair mojibake in *text*, leaving anything that cannot be repaired untouched.

    The string is encoded as Latin-1 and the resulting bytes are decoded as
    UTF-8, which undoes a UTF-8 -> Latin-1 mis-decoding.

    Args:
        text: Value to repair. Non-string values (e.g. NaN, None) are
            returned unchanged.

    Returns:
        The re-decoded string, or *text* itself when it is not a string,
        contains characters outside Latin-1, or does not form valid UTF-8
        once encoded.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except UnicodeError:
        # UnicodeError covers both failure modes: UnicodeEncodeError (text has
        # characters outside Latin-1) and UnicodeDecodeError (the bytes are not
        # valid UTF-8, e.g. text that was already correctly decoded).
        # 'iso-8859-1' is an alias of 'latin1', so retrying with it cannot
        # produce a different outcome.
        return text

# Function to correct encoding
def correct_encoding(text):
    """Encode *text* as Latin-1 and decode the bytes as UTF-8, best effort.

    Any exception raised along the way (including those caused by a
    non-string argument) results in *text* being returned unchanged.
    """
    try:
        repaired = text.encode('latin1').decode('utf-8')
    except Exception:
        # Best effort: hand back the input untouched on any failure
        return text
    return repaired

# Function to check if text contains non-Latin characters
def contains_non_latin(text):
    """Return True if *text* contains a letter that is not from the Latin script.

    The standard-library ``re`` module has no ``\\p{...}`` Unicode property
    classes (a pattern using them raises ``re.error``), so the check is done
    with ``unicodedata`` instead:

    * The text is NFKD-decomposed first, so accented letters become a base
      letter plus combining marks, and compatibility letters (e.g.
      mathematical bold or fullwidth forms) become their plain equivalents.
    * Only characters in a letter category (``L*``) are examined; digits,
      punctuation, whitespace, symbols, emoji and combining marks are treated
      as script-neutral.
    * A letter counts as Latin when its Unicode character name contains
      ``LATIN``.

    This is an approximation of the Unicode ``Script`` property, not an exact
    implementation of it.

    Args:
        text: String to inspect.

    Returns:
        True when at least one non-Latin letter is present, otherwise False
        (including for the empty string).
    """
    for char in unicodedata.normalize('NFKD', text):
        if not unicodedata.category(char).startswith('L'):
            continue
        if 'LATIN' not in unicodedata.name(char, ''):
            return True
    return False

# Function to normalize text
def normalize_text(text):
    """Reduce *text* to plain ASCII unless it holds non-Latin characters.

    When ``contains_non_latin(text)`` is true the text is returned as is.
    Otherwise it is NFKD-normalized and every character that cannot be
    encoded as ASCII is discarded. If any step raises, *text* is returned
    unchanged.
    """
    try:
        if contains_non_latin(text):
            return text
        decomposed = unicodedata.normalize('NFKD', text)
        ascii_bytes = decomposed.encode('ASCII', 'ignore')
        return ascii_bytes.decode('ASCII')
    except Exception:
        # Best effort: hand back the input untouched on any failure
        return text

# Function to check if 'share' contains a link with 'reel'
def contains_reel_link(share_data):
    """Return 1 if *share_data* is a dict whose 'link' value contains 'reel', else 0.

    Anything that is not a dict, or a dict without a 'link' key, yields 0.
    """
    if not isinstance(share_data, dict):
        return 0
    if 'link' not in share_data:
        return 0
    return int('reel' in share_data['link'])

# Function to load all the messages
def _first_reaction(reactions):
    """Return the 'reaction' text of the first entry in *reactions*, or None.

    The text is re-decoded (Latin-1 bytes read as UTF-8) so emoji display
    correctly; if that re-decoding is not possible the raw value is returned.
    """
    if not isinstance(reactions, list) or not reactions:
        return None
    reaction = reactions[0]['reaction']
    try:
        return reaction.encode('latin1').decode('utf-8')
    except UnicodeError:
        return reaction


# Function to load all the messages
def load_messages(root_directory, columns_to_drop, users_to_drop):
    """Load every message JSON file under *root_directory* into one DataFrame.

    Each sub-folder of *root_directory* is scanned for ``*.json`` files; the
    'messages' list of each file becomes rows, tagged with a 'user' column
    taken from the folder name (everything before its last underscore).

    Args:
        root_directory: Directory containing one sub-folder per conversation.
        columns_to_drop: Column labels to remove from the result. Labels that
            do not exist are ignored. It must not contain a column this
            function reads afterwards ('user', 'sender_name', 'reactions',
            'timestamp_ms', 'content', 'share').
        users_to_drop: Values of the 'user' column whose rows are removed.

    Returns:
        A tuple ``(all_messages_df, user_count)``. ``user_count`` is the number
        of sub-folders found, including those later removed through
        *users_to_drop*. The DataFrame gains the derived columns 'timestamp',
        'year', 'month', 'day', 'time', 'hour', 'minute', 'reel' and
        'word_count'; its index is not reset, so labels repeat across files.
    """
    # One DataFrame per JSON file; concatenated once after the scan so the
    # accumulated data is not re-copied for every file.
    frames = []

    # Counter for tracking the number of users
    user_count = 0

    # Iterate through user folders
    for user_folder in os.listdir(root_directory):
        user_folder_path = os.path.join(root_directory, user_folder)

        # Skip anything that is not a folder
        if not os.path.isdir(user_folder_path):
            continue

        user_count += 1
        user_name = user_folder.rsplit('_', 1)[0]

        # Iterate through JSON files in the user's folder
        for json_file in os.listdir(user_folder_path):
            if not json_file.endswith('.json'):
                continue

            data = load_json(os.path.join(user_folder_path, json_file))

            # Create a Pandas DataFrame from the messages and tag it with the user's name
            df = pd.DataFrame(extract_messages(data))
            df['user'] = user_name
            frames.append(df)

    all_messages_df = pd.concat(frames) if frames else pd.DataFrame()

    # Columns read below. A column is absent when no loaded message carries
    # that key (or when nothing was loaded at all); create it empty so the
    # steps below do not fail with a KeyError.
    for column in ('user', 'sender_name', 'reactions', 'timestamp_ms', 'content', 'share'):
        if column not in all_messages_df.columns:
            all_messages_df[column] = None

    # Drop the specified columns
    all_messages_df = all_messages_df.drop(columns=columns_to_drop, errors='ignore')

    # Drop rows where 'user' is in the list of users to drop, where 'sender_name'
    # is "Instagram User", or where 'user' is "instagramuser".
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    all_messages_df = all_messages_df[
        ~(all_messages_df['user'].isin(users_to_drop) |
        (all_messages_df['sender_name'] == "Instagram User") |
        (all_messages_df['user'] == "instagramuser")
        )
    ].copy()

    # Keep only the first reaction of each message, converted to readable emoji
    all_messages_df['reactions'] = all_messages_df['reactions'].apply(_first_reaction)

    # Convert the 'timestamp_ms' column (milliseconds since the epoch) to datetime format
    all_messages_df['timestamp'] = pd.to_datetime(all_messages_df['timestamp_ms'], unit='ms')

    # Create new columns for year, month, day, and time
    all_messages_df['year'] = all_messages_df['timestamp'].dt.year
    all_messages_df['month'] = all_messages_df['timestamp'].dt.month
    all_messages_df['day'] = all_messages_df['timestamp'].dt.day
    all_messages_df['time'] = all_messages_df['timestamp'].dt.strftime('%H:%M')
    all_messages_df['hour'] = all_messages_df['timestamp'].dt.hour
    all_messages_df['minute'] = all_messages_df['timestamp'].dt.minute

    # Turn mis-decoded (weird-looking) characters in 'sender_name' into readable text
    all_messages_df['sender_name'] = all_messages_df['sender_name'].apply(correct_encoding)

    # Normalize the 'sender_name' column
    all_messages_df['sender_name'] = all_messages_df['sender_name'].apply(normalize_text)

    # Apply the fix_encoding function to the 'content' column
    all_messages_df['content'] = all_messages_df['content'].apply(fix_encoding)

    # Drop rows whose 'content' contains either of these Arabic phrases
    # (presumably auto-generated reaction/like notices — verify against an export)
    all_messages_df = all_messages_df[~all_messages_df['content'].str.contains("تم التفاعل باستخدام", na=False)]
    all_messages_df = all_messages_df[~all_messages_df['content'].str.contains("هامت رسالة على الإعجاب", na=False)]

    # Apply the function to create the 'reel' column
    all_messages_df['reel'] = all_messages_df['share'].apply(contains_reel_link)

    # Create a new column for word count. Rows without text content count as 0
    # words; casting them to str would turn a missing value into the word 'nan'.
    all_messages_df['word_count'] = all_messages_df['content'].apply(
        lambda x: len(x.split()) if isinstance(x, str) else 0
    )

    return all_messages_df, user_count