Análisis de tweets de @MariaFdaCabal, @etorrescobo y @nikolas_dm

Code
# Filter tweets per account and localize each account's timestamps.
# .copy() makes each subset an independent frame so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
ecu = data[data['user_screen_name'] == 'etorrescobo'].copy()

# convert time column to Guayaquil, Ecuador timezone
# (the original comment wrongly said "Brasilia, Brazil")
ecu['date'] = ecu['date'].dt.tz_convert(pytz.timezone('America/Guayaquil'))

bra = data[data['user_screen_name'] == 'nikolas_dm'].copy()

# convert time column to Brasilia, Brazil timezone
bra['date'] = bra['date'].dt.tz_convert(pytz.timezone('America/Sao_Paulo'))

col = data[data['user_screen_name'] == 'MariaFdaCabal'].copy()

# convert time column to Bogota, Colombia timezone
# (the original comment wrongly said "Brasilia, Brazil")
col['date'] = col['date'].dt.tz_convert(pytz.timezone('America/Bogota'))

# concatenate the three per-account dataframes
df = pd.concat([ecu, col, bra], axis=0)

print(len(df))
47811

Pre-procesamiento de texto en español

Code
# load the spaCy pipeline for Spanish
nlp_es = spacy.load("es_core_news_sm")

# Spanish stop-word set shipped with the model
STOP_WORDS_ES = nlp_es.Defaults.stop_words

def filter_stopwords(text):
    """Lower-case *text*, drop stop words and non-alphabetic tokens,
    and return the surviving tokens joined by single spaces."""
    kept = []
    for token in nlp_es(text.lower()):
        # skip stop words (flag or explicit set membership)
        if token.is_stop or token.text in STOP_WORDS_ES:
            continue
        # keep only purely alphabetic tokens
        if token.is_alpha:
            kept.append(token.text)
    return ' '.join(kept)

# preprocess the two Spanish-language accounts (Colombia and Ecuador)
col['text_pre'] = col['text'].apply(filter_stopwords)
ecu['text_pre'] = ecu['text'].apply(filter_stopwords)

Pre-procesamiento de texto en portugués

Code
# load the spaCy model for Portuguese
nlp_pt = spacy.load("pt_core_news_sm")

# load stop words for Portuguese (the original comment wrongly said "Spanish")
STOP_WORDS_PT = nlp_pt.Defaults.stop_words

# Function to filter stop words. NOTE: this re-defines filter_stopwords
# from the Spanish cell; shadowing is harmless because the Spanish
# version has already been applied to col/ecu above.
def filter_stopwords(text):
    # lower-case before tokenising so stop-word matching is case-insensitive
    doc = nlp_pt(text.lower())
    # keep only alphabetic tokens that are not stop words
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS_PT and token.is_alpha]
    return ' '.join(tokens)

# apply the Portuguese filter to the Brazilian account
bra['text_pre'] = bra['text'].apply(filter_stopwords)

Tópicos en tweets de @MariaFdaCabal

Code
# strip URLs, mentions and numbers from the preprocessed text.
# NOTE(review): hashtags are NOT removed — p.OPT.HASHTAG is not set,
# despite what the original comment claimed.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
col['text_pre'] = col['text_pre'].apply(lambda x: p.clean(x))

# replace emojis with their ':name:' text descriptions
col['text_pre'] = col['text_pre'].apply(lambda x: demojize(x))

# documents to model: preprocessed tweets from @MariaFdaCabal
docs = col['text_pre']

# multilingual BERTopic model; calculate_probabilities also returns
# per-document topic probabilities from fit_transform
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# fit the model and assign a topic to each tweet
topics, probs = topic_model.fit_transform(docs)

# intertopic distance map
topic_model.visualize_topics()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Code
# reduce the number of topics to 31
# NOTE(review): recent BERTopic versions update the fitted model in
# place; confirm this matches the installed version's behaviour
topic_model.reduce_topics(docs, nr_topics=31)

# visualize topics
topic_model.visualize_topics()
Code
# bar chart of the top words for each of the (reduced) 31 topics
topic_model.visualize_barchart(top_n_topics=31)
Code
# selection of topics assumed to cover gender issues.
# NOTE(review): topic 13's printed keywords (madrid, barcelona, copa,
# macron, ...) look like football/European politics, not gender —
# confirm the topic id before trusting the percentage below.
topics = [13]

# collect the keyword list of every selected topic
keywords_list = []
for topic_id in topics:
    topic_words = topic_model.get_topic(topic_id)  # list of (word, score)
    keywords_list.append([word for word, _ in topic_words])

# flatten list of lists
words_list = [item for sublist in keywords_list for item in sublist]
word_set = set(words_list)

# keep tweets containing at least one keyword as a whole token.
# The original used substring matching (`word in x`), which also hit
# keywords embedded inside longer words and inflated the count.
filtered_col = col[col['text_pre'].apply(lambda x: not word_set.isdisjoint(x.split()))]

percentage = round(100 * len(filtered_col) / len(col), 2)
print(f"Del total de {len(col)} tweets de @MariaFdaCabal, alrededor de {len(filtered_col)} hablan sobre temas de género, es decir, cerca del {percentage}%")

print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 32462 tweets de @MariaFdaCabal, alrededor de 753 hablan sobre temas de género, es decir, cerca del 2.32%
Lista de palabras en tópicos [13]:
['madrid', 'real', 'hala', 'barcelona', 'macron', 'vs', 'copa', 'europeos', 'francia', 'atlético']

Tópicos en tweets de @etorrescobo

Code
# strip URLs, mentions and numbers from the preprocessed text.
# NOTE(review): hashtags are NOT removed — p.OPT.HASHTAG is not set,
# despite what the original comment claimed.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
ecu['text_pre'] = ecu['text_pre'].apply(lambda x: p.clean(x))

# replace emojis with their ':name:' text descriptions
ecu['text_pre'] = ecu['text_pre'].apply(lambda x: demojize(x))

# documents to model: preprocessed tweets from @etorrescobo
docs = ecu['text_pre']

# multilingual BERTopic model with per-document topic probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# fit the model and assign a topic to each tweet
topics, probs = topic_model.fit_transform(docs)

# intertopic distance map
topic_model.visualize_topics()
Code
# reduce the number of topics to 31
# NOTE(review): recent BERTopic versions update the fitted model in
# place; confirm this matches the installed version's behaviour
topic_model.reduce_topics(docs, nr_topics=31)

# visualize topics
topic_model.visualize_topics()
Code
# bar chart of the top words for each of the (reduced) 31 topics
topic_model.visualize_barchart(top_n_topics=31)
Code
# selection of topics about gender issues (topic 6's keywords — aborto,
# mujeres, feminismo, ... — support the label)
topics = [6]

# collect the keyword list of every selected topic
keywords_list = []
for topic_id in topics:
    topic_words = topic_model.get_topic(topic_id)  # list of (word, score)
    keywords_list.append([word for word, _ in topic_words])

# flatten list of lists
words_list = [item for sublist in keywords_list for item in sublist]
word_set = set(words_list)

# keep tweets containing at least one keyword as a whole token.
# The original used substring matching (`word in x`), which also hit
# keywords embedded inside longer words and inflated the count.
filtered_ecu = ecu[ecu['text_pre'].apply(lambda x: not word_set.isdisjoint(x.split()))]

percentage = round(100 * len(filtered_ecu) / len(ecu), 2)
print(f"Del total de {len(ecu)} tweets de @etorrescobo, alrededor de {len(filtered_ecu)} hablan sobre temas de género, es decir, cerca del {percentage}%")

print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 8314 tweets de @etorrescobo, alrededor de 423 hablan sobre temas de género, es decir, cerca del 5.09%
Lista de palabras en tópicos [6]:
['aborto', 'niños', 'mujeres', 'violación', 'adolescentes', 'feminismo', 'despenalización', 'mujer', 'hijos', 'vida']

Tópicos en tweets de @nikolas_dm

Code
# strip URLs, mentions and numbers from the preprocessed text.
# NOTE(review): hashtags are NOT removed — p.OPT.HASHTAG is not set,
# despite what the original comment claimed.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
bra['text_pre'] = bra['text_pre'].apply(lambda x: p.clean(x))

# replace emojis with their ':name:' text descriptions
bra['text_pre'] = bra['text_pre'].apply(lambda x: demojize(x))

# documents to model: preprocessed tweets from @nikolas_dm
docs = bra['text_pre']

# multilingual BERTopic model with per-document topic probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# fit the model and assign a topic to each tweet
topics, probs = topic_model.fit_transform(docs)

# intertopic distance map
topic_model.visualize_topics()
Code
# reduce the number of topics to 31
# NOTE(review): recent BERTopic versions update the fitted model in
# place; confirm this matches the installed version's behaviour
topic_model.reduce_topics(docs, nr_topics=31)

topic_model.visualize_barchart(top_n_topics=31)
Code
# selection of topics about gender issues (topic 12's keywords — mulher,
# aborto, feminismo, ... — support the label)
topics = [12]

# collect the keyword list of every selected topic
keywords_list = []
for topic_id in topics:
    topic_words = topic_model.get_topic(topic_id)  # list of (word, score)
    keywords_list.append([word for word, _ in topic_words])

# flatten list of lists
words_list = [item for sublist in keywords_list for item in sublist]
word_set = set(words_list)

# keep tweets containing at least one keyword as a whole token.
# The original used substring matching (`word in x`), which also hit
# keywords embedded inside longer words and inflated the count.
filtered_bra = bra[bra['text_pre'].apply(lambda x: not word_set.isdisjoint(x.split()))]

percentage = round(100 * len(filtered_bra) / len(bra), 2)
print(f"Del total de {len(bra)} tweets de @nikolas_dm, alrededor de {len(filtered_bra)} hablan sobre temas de género, es decir, cerca del {percentage}%")

print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 7035 tweets de @nikolas_dm, alrededor de 222 hablan sobre temas de género, es decir, cerca del 3.16%
Lista de palabras en tópicos [12]:
['mulher', 'aborto', 'feminista', 'feminismo', 'feministas', 'mulheres', 'movimento', 'chega', 'homem', 'geralmente']

Visualización

Code
# concatenate the three filtered per-account dataframes
filtered_df = pd.concat([filtered_col, filtered_ecu, filtered_bra], axis=0)

# rebuild 'date' from the ISO-8601 'local_time' strings; parsing with the
# 'T' directly in the format string makes the str.replace step unnecessary.
# NOTE(review): this overwrites the tz-aware 'date' built earlier with a
# naive local timestamp — confirm that is intended.
filtered_df['date'] = pd.to_datetime(filtered_df['local_time'], format='%Y-%m-%dT%H:%M:%S')
Code
# drop rows where either like_count or retweet_count is zero
filtered_df = filtered_df[(filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)]

# 'impressions' is the AVERAGE of likes and retweets (not the sum, as the
# original comment said); only consumed by the commented-out size= below
filtered_df['impressions'] = (filtered_df['like_count'] + filtered_df['retweet_count'])/2

# extract year from datetime column
filtered_df['year'] = filtered_df['date'].dt.year

# strip URLs only — mentions, hashtags and numbers are kept here,
# unlike the per-account preprocessing cells
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(lambda x: p.clean(x))

# Create scatter plot: likes vs retweets, coloured by account
fig = px.scatter(filtered_df, x='like_count', 
                 y='retweet_count',
                #  size='impressions', 
                 color='user_name',
                 labels={"user_name": "Cuenta de Twitter"},
                 color_discrete_sequence=["#FD9432", "#5647E5", "#F666f8"],
                 hover_name='tweet_text')

# Update title and axis labels
fig.update_layout(
    title='Likes vs Retweets en tweets que hablan sobre temas de género',
    xaxis_title='Número de likes',
    yaxis_title='Número de retweets',
    plot_bgcolor="#eef4f5"
)

fig.show()