min_date = df['date'].min()
max_date = df['date'].max()
print(f"\nPeriod of collected tweets: {min_date} / {max_date}\n")
Period of collected tweets: 2011-06-14 20:50:03-05:00 / 2023-03-01 12:22:56-05:00
General information about the database
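The overview below is standard pandas output; a minimal sketch of the call that produces it, assuming df is the already-loaded DataFrame:
# print index, column dtypes, non-null counts and memory usage
df.info()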
<class 'pandas.core.frame.DataFrame'>
Index: 7830 entries, 171420 to 179249
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 query 7830 non-null object
1 id 7830 non-null float64
2 timestamp_utc 7830 non-null int64
3 local_time 7830 non-null object
4 user_screen_name 7830 non-null object
5 text 7830 non-null object
6 possibly_sensitive 4135 non-null object
7 retweet_count 7830 non-null float64
8 like_count 7830 non-null float64
9 reply_count 7830 non-null float64
10 impression_count 16 non-null object
11 lang 7830 non-null object
12 to_username 1568 non-null object
13 to_userid 1568 non-null float64
14 to_tweetid 1281 non-null float64
15 source_name 7830 non-null object
16 source_url 7830 non-null object
17 user_location 7830 non-null object
18 lat 4 non-null object
19 lng 4 non-null object
20 user_id 7830 non-null object
21 user_name 7830 non-null object
22 user_verified 7830 non-null float64
23 user_description 7830 non-null object
24 user_url 7830 non-null object
25 user_image 7830 non-null object
26 user_tweets 7830 non-null object
27 user_followers 7830 non-null float64
28 user_friends 7830 non-null object
29 user_likes 7830 non-null float64
30 user_lists 7830 non-null float64
31 user_created_at 7830 non-null object
32 user_timestamp_utc 7830 non-null float64
33 collected_via 7830 non-null object
34 match_query 7830 non-null float64
35 retweeted_id 0 non-null float64
36 retweeted_user 0 non-null float64
37 retweeted_user_id 0 non-null float64
38 retweeted_timestamp_utc 0 non-null object
39 quoted_id 293 non-null object
40 quoted_user 293 non-null object
41 quoted_user_id 293 non-null float64
42 quoted_timestamp_utc 293 non-null float64
43 collection_time 7830 non-null object
44 url 7830 non-null object
45 place_country_code 265 non-null object
46 place_name 265 non-null object
47 place_type 265 non-null object
48 place_coordinates 265 non-null object
49 links 2904 non-null object
50 domains 2904 non-null object
51 media_urls 1533 non-null object
52 media_files 1533 non-null object
53 media_types 1533 non-null object
54 media_alt_texts 47 non-null object
55 mentioned_names 2767 non-null object
56 mentioned_ids 2613 non-null object
57 hashtags 4969 non-null object
58 intervention_type 0 non-null float64
59 intervention_text 0 non-null float64
60 intervention_url 0 non-null float64
61 country 7830 non-null object
62 date 7830 non-null datetime64[ns, America/Bogota]
dtypes: datetime64[ns, America/Bogota](1), float64(20), int64(1), object(41)
memory usage: 3.8+ MB
Top 20 list of other websites mentioned in the tweets and their frequency
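The counting code for this table is not shown; a sketch assuming the domains column uses the same pipe-delimited format as the hashtags and mentioned_names columns counted later in this section:
# convert dataframe column to list
domains = df['domains'].to_list()
# remove nan items from list
domains = [x for x in domains if not pd.isna(x)]
# split items into a list based on a delimiter
domains = [x.split('|') for x in domains]
# flatten list of lists
domains = [item for sublist in domains for item in sublist]
# count items on list and keep the 20 most frequent
pd.Series(domains).value_counts().nlargest(20)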
domains
fb.me 1231
bit.ly 242
unidosporlavida.com 193
facebook.com 171
instagram.com 125
sumall.com 98
youtube.com 68
lifenews.com 40
citizengo.org 36
youtu.be 33
20ft.net 33
aciprensa.com 19
votocatolico.co 18
actuall.com 15
shar.es 15
liveactionnews.org 15
twitter.com 12
es.gaudiumpress.org 12
religionenlibertad.com 10
razonmasfe.com 8
Name: count, dtype: int64
Top 20 list of the most used hashtags and their frequency
# convert dataframe column to list
hashtags = df['hashtags'].to_list()
# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]
# split items into a list based on a delimiter
hashtags = [x.split('|') for x in hashtags]
# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]
# count items on list
hashtags_count = pd.Series(hashtags).value_counts()
# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)
top_hashtags
sialavida 647
aborto 416
9marchaxlavida 373
noalaborto 325
colombiaesprovida 295
eutanasia 157
procuradorordóñez 139
sialprocurador 138
yosoyprovida 135
soyprovida 108
negocio 106
repost 100
todavidaimporta 100
elijolas2vidas 98
colombia 93
eutanasiano 91
abortocero 91
fiestaxlavida 91
4mayo7marchaporlavida 89
caravanaporlavida 88
Name: count, dtype: int64
Top 20 most mentioned users in the tweets
# convert dataframe column to list
users = df['mentioned_names'].to_list()
# remove nan items from list
users = [x for x in users if not pd.isna(x)]
# split items into a list based on a delimiter
users = [x.split('|') for x in users]
# flatten list of lists
users = [item for sublist in users for item in sublist]
# count items on list
users_count = pd.Series(users).value_counts()
# return first n rows in descending order
top_users = users_count.nlargest(20)
top_users
marceposada 196
colombiaprovida 194
cconstitucional 176
monicaroa 173
sialprocurador 106
unidosxlavidaco 105
noticiasrcn 83
7marcofidelr 62
amadarosa 59
referendoxvida 51
colombiaderecha 49
profamiliacol 48
oea_oficial 47
comisionprimera 42
camaracolombia 40
lam_vero 36
wradiocolombia 35
unidosxlavida 35
yosoyprovida 34
aciprensa 32
Name: count, dtype: int64
Top 20 list of the most common tokens and their frequency
# load the spacy model for Spanish
import spacy
nlp = spacy.load("es_core_news_sm")
# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words
# function to filter stop words
def filter_stopwords(text):
    # lowercase the text before tokenizing
    doc = nlp(text.lower())
    # keep alphabetic tokens that are not stop words
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)
# apply function to dataframe column
df['text_pre'] = df['text'].apply(filter_stopwords)
# count items on column
token_counts = df["text_pre"].str.split(expand=True).stack().value_counts()[:20]
token_counts
vida 2070
aborto 1097
colombia 719
sialavida 661
colombiaesprovida 437
mayo 390
q 388
noalaborto 370
eutanasia 323
derecho 323
gracias 309
provida 308
muerte 268
feliz 268
d 263
voz 250
mujer 222
familia 210
mujeres 204
concepción 191
Name: count, dtype: int64
List of the 10 hours with the highest number of published tweets
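The code for this table is not shown; a minimal sketch, assuming the hour is taken from the timezone-aware date column (strftime yields the zero-padded hours seen below):
# extract the zero-padded hour from the datetime column
df['hour'] = df['date'].dt.strftime('%H')
# count tweets per hour and keep the 10 busiest
df['hour'].value_counts().nlargest(10)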
hour
11 786
10 737
12 677
09 622
14 525
13 519
08 519
07 448
19 426
15 403
Name: count, dtype: int64
Platforms from which content was published and their frequency
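No code is shown here either; a one-line value_counts over source_name reproduces a table like the one below:
# count tweets per publishing platform
df['source_name'].value_counts()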
source_name
Twitter for iPhone 2031
Twitter Web App 1706
Twitter Web Client 1487
Facebook 1468
Twitter for Android 412
Mobile Web 163
TweetDeck 133
erased88075 131
Twitter for Websites 124
Instagram 99
UberSocial for iPhone 22
Mobile Web (M2) 12
iOS 11
Twitter for Android Tablets 10
Twitter for Mac 7
Tweeet! on iOS 4
Hootsuite Inc. 3
Buffer 3
Hootsuite 2
Twibbon 1
Periscope 1
Name: count, dtype: int64
Topic modeling technique with transformers and TF-IDF
# imports for the preprocessing and modeling steps below
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
# remove urls, mentions and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
df['text_pre'] = df['text_pre'].apply(lambda x: p.clean(x))
# replace emojis with text descriptions
df['text_pre'] = df['text_pre'].apply(lambda x: demojize(x))
# filter column
docs = df['text_pre']
# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)
# training
topics, probs = topic_model.fit_transform(docs)
# visualize topics
topic_model.visualize_topics()
Map with 20% of the total number of generated topics
Selection of topics touching on gender issues
# selection of topics
topics = [1]
keywords_list = []
for topic_ in topics:
    topic = topic_model.get_topic(topic_)
    keywords = [x[0] for x in topic]
    keywords_list.append(keywords)
# flatten list of lists
words_list = [item for sublist in keywords_list for item in sublist]
# keep rows whose preprocessed text contains any of the topic keywords
filtered_df = df[df['text_pre'].apply(lambda x: any(word in x for word in words_list))]
percentage = round(100 * len(filtered_df) / len(df), 2)
print(f"Out of {len(df)} tweets by @UnidosxlaVidaCo, around {len(filtered_df)} touch on gender issues, i.e., roughly {percentage}%")
print(f"List of words in topics {topics}:\n{words_list}")
Out of 7830 tweets by @UnidosxlaVidaCo, around 2750 touch on gender issues, i.e., roughly 35.12%
List of words in topics [1]:
['aborto', 'negocio', 'eutanasia', 'abortocero', 'mujeres', 'mujer', 'sialavida', 'abortonoesderecho', 'parenthood', 'apoyo']
# drop rows with 0 values in the like and retweet columns
filtered_df = filtered_df[(filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)]
# add a new column with the average of likes and retweets
filtered_df['impressions'] = (filtered_df['like_count'] + filtered_df['retweet_count']) / 2
# extract year from datetime column
filtered_df['year'] = filtered_df['date'].dt.year
# remove urls only, keeping mentions and hashtags in the displayed text
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(lambda x: p.clean(x))
# create scatter plot
import plotly.express as px
fig = px.scatter(filtered_df, x='like_count',
                 y='retweet_count',
                 size='impressions',
                 color='year',
                 hover_name='tweet_text')
# update title and axis labels
fig.update_layout(
    title='Tweets talking about gender with most Likes and Retweets',
    xaxis_title='Number of Likes',
    yaxis_title='Number of Retweets'
)
fig.show()
# convert columns to lists
tweets = df['text_pre'].to_list()
timestamps = df['local_time'].to_list()
# track how topics evolve over time across 20 bins
topics_over_time = topic_model.topics_over_time(docs=tweets,
                                                timestamps=timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)