02_cleaner.py
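"""Clean the raw monthly tweet CSVs from ./01_tweets/.

For each month from 2019 through 2022 the script deduplicates tweets, filters
them by topic keywords, resolves CDMX boroughs for coordinate-based records
via the Google Maps Geocoding API, adjusts timestamps for daylight saving
time, tokenizes the text, extracts hashtags and mentions, and writes the
result to ./02_clean_tweets/.
"""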
import datetime
import json
import re

import nltk
import pandas as pd
import requests
import spacy
from autocorrect import Speller
from nltk.corpus import stopwords

import credentials

# Download the stopword list once; this is a no-op if it is already cached
nltk.download('stopwords')

# Load heavy resources once at module level instead of on every call
stop_words = stopwords.words('spanish')
spell = Speller(lang='es')
nlp = spacy.load("es_core_news_md")
def tokens_tweet(tweet):
    """Tokenize a tweet, remove stopwords, spell-correct, and lemmatize."""
    pattern = r'''(?x)           # set flag to allow verbose regexps
          (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*           # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?     # currency and percentages, e.g. $12.40, 82%
        | [][.,;"'?():_`-]       # punctuation as separate tokens; includes ], [
                                 # (hyphen placed last so it is literal, not a range)
    '''
    # Lowercase the whole text
    tweet = tweet.lower()
    # Remove mentions, hashtags, links, and line breaks
    tweet = re.sub(r"@[A-Za-z0-9_]+", "", tweet)
    tweet = re.sub(r"#\S+", "", tweet)
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"www\.\S+", "", tweet)
    tweet = re.sub(r"\n", "", tweet)
    # Tokenization
    tokens = nltk.regexp_tokenize(tweet, pattern)
    tokens = [token for token in tokens if len(token) > 1]
    # Remove stopwords
    interesting_tokens = [w for w in tokens if w not in stop_words]
    # Spell-correct each token
    interesting_tokens = [spell(w) for w in interesting_tokens]
    # Lemmatization
    doc = nlp(' '.join(interesting_tokens))
    return [w.lemma_ for w in doc]
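# Rough usage sketch (output is illustrative; the actual lemmas depend on the
# es_core_news_md model version):
#   tokens_tweet("Fuerte incendio en la colonia Roma https://t.co/abc")
#   -> something like ['fuerte', 'incendio', 'colonia', 'roma']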
def mun_request(row, parameters):
    # Use the Google Maps API to resolve the correct location of tweets
    # extracted by coordinates
    if row['typeQuery'] == 'coordenadas':
        url = (f"https://maps.googleapis.com/maps/api/geocode/json"
               f"?latlng={row['latitude']},{row['longitude']}"
               f"&key={credentials.GOOGLE_MAPS_KEY}")
        res = requests.get(url)
        elements = res.json()
        if re.search("CDMX", elements['plus_code']['compound_code']):
            # Keep the first address component that matches a known borough
            for i in elements['results']:
                if i['address_components'][0]['long_name'] in parameters["geo"]:
                    mun = i['address_components'][0]['long_name']
                    row['geoID'] = parameters["geo"][mun]["clave_alcaldia"]
                    row['geoName'] = mun
                    break
        else:
            # Coordinates resolve outside CDMX; mark the row for removal
            row['geoID'] = 0
    return row
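# Assumed shape of the Geocoding API response used above (abridged):
#   {"plus_code": {"compound_code": "... CDMX, Mexico"},
#    "results": [{"address_components": [{"long_name": "Coyoacán", ...}, ...]}, ...]}
# Only the first address component of each result is checked against the
# boroughs listed under parameters["geo"].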
def without_accents(text):
    # Strip Spanish accents so keyword matching is accent-insensitive
    return text.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
def is_valid_tweet(tweet, topic, parameters):
    # Normalize: lowercase, strip accents, and trim whitespace
    tweet = without_accents(tweet.lower()).strip()
    # The tweet must contain at least one of the topic's required keywords...
    contains_ok = any(without_accents(palabra) in tweet
                      for palabra in parameters["topics"][topic]["contains"])
    # ...and none of the topic's excluded keywords
    not_contains_ok = not any(without_accents(palabra) in tweet
                              for palabra in parameters["topics"][topic]["not_contains"])
    return contains_ok and not_contains_ok
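# Illustrative sketch with hypothetical parameters (the real keyword lists
# live in ./00_querys/topics.json):
#   params = {"topics": {"incendio": {"contains": ["incendio"],
#                                     "not_contains": ["simulacro"]}}}
#   is_valid_tweet("Gran incendio en Iztapalapa", "incendio", params)  # -> True
#   is_valid_tweet("Simulacro de incendio hoy", "incendio", params)   # -> False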
def log_df(df, year, month):
    # Record how many tweets survived cleaning for each month and year
    with open("./02_clean_tweets/01_log.txt", "a") as file:
        file.write(f'Created file "tweets_{month}{year}" with {len(df)} records: ')
        file.write(f'[pirotecnia: {len(df[df.topicQuery == "pirotecnia"])}, '
                   f'tránsito: {len(df[df.topicQuery == "tránsito"])}, '
                   f'incendio: {len(df[df.topicQuery == "incendio"])}]')
        file.write('\n')
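# Each log line then looks like (counts illustrative):
#   Created file "tweets_0119" with 1234 records: [pirotecnia: 400, tránsito: 500, incendio: 334]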
def main():
    # Open the JSON file with the topic definitions
    with open("./00_querys/topics.json") as file:
        parameters = json.load(file)
    # Iterate over every year and every month
    for año in range(19, 23):
        for mes in range(1, 13):
            # Open each month's file for cleaning (file names zero-pad the month)
            df = pd.read_csv(f'./01_tweets/tweets_{mes:02d}{año}.csv')
            # Remove duplicate records
            df = df.drop_duplicates(['pubID'])
            # Drop tweets that do not satisfy the topic keyword rules
            df["es_valido"] = df.apply(lambda x: is_valid_tweet(x["tweet"], x["topicQuery"], parameters), axis=1)
            df = df[df["es_valido"]].drop(columns=["es_valido"])
            # Resolve the correct borough for records extracted by coordinates
            df = df.apply(lambda x: mun_request(x, parameters), axis=1)
            # Drop records located outside CDMX
            df = df.drop(df[df['geoID'] == 0].index)
            # Convert bool values to 0/1 for use in MySQL
            df['authorVerified'] = df['authorVerified'].apply(lambda x: 1 if x else 0)
            # Drop records with null values in the important fields
            # (the engagement counters are allowed to be null)
            cabeceras = list(df.columns)
            for counter in ('likeCount', 'replyCount', 'retweetCount',
                            'followersCount', 'followingCount'):
                cabeceras.remove(counter)
            df = df.dropna(subset=cabeceras)
            # Shift tweet timestamps that fall inside daylight saving time
            # (in CDMX, DST ran from the first Sunday of April to the last
            # Sunday of October)
            dst_ranges = {
                19: (datetime.datetime(2019, 4, 7, 2), datetime.datetime(2019, 10, 27, 2)),
                20: (datetime.datetime(2020, 4, 5, 2), datetime.datetime(2020, 10, 25, 2)),
                21: (datetime.datetime(2021, 4, 4, 2), datetime.datetime(2021, 10, 31, 2)),
                22: (datetime.datetime(2022, 4, 3, 2), datetime.datetime(2022, 10, 30, 2)),
            }
            dst_start, dst_end = dst_ranges[año]

            def adjust_dst(value):
                parsed = datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
                if dst_start < parsed < dst_end:
                    return parsed + datetime.timedelta(hours=1)
                return value

            df['pubDate'] = df['pubDate'].apply(adjust_dst)
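            # For example, '2019-07-01 12:00:00' falls inside the 2019 DST
            # window and becomes 13:00:00, while '2019-01-15 12:00:00' is
            # left unchanged.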
            # Parse dates once, derive the date-part columns, and sort chronologically
            df['pubDate'] = pd.to_datetime(df['pubDate'])
            df['pubYear'] = df['pubDate'].dt.strftime('%Y')
            df['pubMonth'] = df['pubDate'].dt.strftime('%m')
            df['pubDay'] = df['pubDate'].dt.strftime('%d')
            df['pubHour'] = df['pubDate'].dt.strftime('%H')
            df['pubMinute'] = df['pubDate'].dt.strftime('%M')
            df = df.sort_values(by='pubDate')
            # Create the tokens for each tweet
            if "tokens" in df.columns:
                df['tokens'] = df['tweet'].apply(tokens_tweet)
            else:
                df.insert(4, "tokens", df['tweet'].apply(tokens_tweet))
            # Build the list of hashtags
            if "hashtags" in df.columns:
                df["hashtags"] = df['tweet'].apply(lambda x: re.findall(r"\B#([\w-]+)", x))
            else:
                df.insert(5, "hashtags", df['tweet'].apply(lambda x: re.findall(r"\B#([\w-]+)", x)))
            # Build the list of mentions
            if "mentions" in df.columns:
                df["mentions"] = df['tweet'].apply(lambda x: re.findall(r"\B@([\w-]+)", x))
            else:
                df.insert(6, "mentions", df['tweet'].apply(lambda x: re.findall(r"\B@([\w-]+)", x)))
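            # For instance, re.findall(r"\B#([\w-]+)", "Cierre en #Reforma por #marcha")
            # returns ['Reforma', 'marcha']; mentions are captured the same way with @.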
            # Save the clean dataset and log the surviving record counts
            df.to_csv(f'./02_clean_tweets/clean_tweets_{mes:02d}{año}.csv', index=False)
            log_df(df, año, f'{mes:02d}')


if __name__ == '__main__':
    main()