Análisis de tweets de @MariaFdaCabal, @etorrescobo y @nikolas_dm
Code
# Split the dataset into one frame per account and express each account's
# timestamps in that account's local timezone.
# NOTE(review): `.dt.tz_convert` only works on tz-aware datetimes — confirm that
# `data['date']` is parsed as tz-aware where `data` is loaded.

# Ecuador: @etorrescobo -> America/Guayaquil.
# `.copy()` makes the frame independent of `data`, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
ecu = data[data['user_screen_name'] == 'etorrescobo'].copy()
ecu['date'] = ecu['date'].dt.tz_convert(pytz.timezone('America/Guayaquil'))

# Brazil: @nikolas_dm -> America/Sao_Paulo.
bra = data[data['user_screen_name'] == 'nikolas_dm'].copy()
bra['date'] = bra['date'].dt.tz_convert(pytz.timezone('America/Sao_Paulo'))

# Colombia: @MariaFdaCabal -> America/Bogota.
col = data[data['user_screen_name'] == 'MariaFdaCabal'].copy()
col['date'] = col['date'].dt.tz_convert(pytz.timezone('America/Bogota'))

# Concatenate the three per-account frames (row-wise) and report the total.
# NOTE(review): the three 'date' columns now carry different timezones; check
# the resulting dtype of df['date'] before relying on the `.dt` accessor.
df = pd.concat([ecu, col, bra], axis=0)
print(len(df))
47811
Pre-procesamiento de texto en español
Code
# Load the spaCy model for Spanish.
nlp_es = spacy.load("es_core_news_sm")

# Stop-word set shipped with the Spanish model.
STOP_WORDS_ES = nlp_es.Defaults.stop_words


def filter_stopwords(text):
    """Return *text* lower-cased with Spanish stop words removed.

    The text is lower-cased and tokenized with ``nlp_es``. A token is kept
    only if it is not flagged as a stop word, its text is not in
    ``STOP_WORDS_ES``, and it is purely alphabetic. The kept tokens are
    joined with single spaces.

    NOTE(review): the Portuguese preprocessing cell defines a function with
    the same name, so after that cell runs this name points to the
    Portuguese variant. Re-run this cell before re-applying it to Spanish
    text.
    """
    doc = nlp_es(text.lower())
    tokens = [
        token.text
        for token in doc
        if not token.is_stop
        and token.text not in STOP_WORDS_ES
        and token.is_alpha
    ]
    return ' '.join(tokens)


# Apply the filter to the Spanish-language accounts.
col['text_pre'] = col['text'].apply(filter_stopwords)
ecu['text_pre'] = ecu['text'].apply(filter_stopwords)
Pre-procesamiento de texto en portugués
Code
# Load the spaCy model for Portuguese.
nlp_pt = spacy.load("pt_core_news_sm")

# Stop-word set shipped with the Portuguese model.
STOP_WORDS_PT = nlp_pt.Defaults.stop_words


def filter_stopwords(text):
    """Return *text* lower-cased with Portuguese stop words removed.

    The text is lower-cased and tokenized with ``nlp_pt``. A token is kept
    only if it is not flagged as a stop word, its text is not in
    ``STOP_WORDS_PT``, and it is purely alphabetic. The kept tokens are
    joined with single spaces.

    NOTE(review): this rebinds ``filter_stopwords``, which the Spanish
    preprocessing cell also defines. From here on the name refers to the
    Portuguese variant.
    """
    doc = nlp_pt(text.lower())
    tokens = [
        token.text
        for token in doc
        if not token.is_stop
        and token.text not in STOP_WORDS_PT
        and token.is_alpha
    ]
    return ' '.join(tokens)


# Apply the filter to the Portuguese-language account.
bra['text_pre'] = bra['text'].apply(filter_stopwords)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Code
# Reduce the number of topics to 31.
# NOTE(review): `topic_model` and `docs` are defined outside this excerpt —
# confirm they belong to the account analysed in the cells that follow.
topic_model.reduce_topics(docs, nr_topics=31)

# Visualize topics (kept as the last expression so the notebook renders it).
topic_model.visualize_topics()
Code
# Draw the per-topic bar charts for up to 31 topics of `topic_model`
# (presumably a BERTopic model — it is created outside this excerpt).
topic_model.visualize_barchart(top_n_topics=31)
Code
# Topic ids whose keywords are used to select tweets.
# NOTE(review): the keywords recorded for topic 13 in this cell's output are
# football-related, while the message below says "temas de género" — confirm
# the topic id against the current model.
topics = [13]

# One keyword list per selected topic; each entry returned by get_topic is
# indexed with [0] to take the word.
keywords_list = [
    [entry[0] for entry in topic_model.get_topic(topic_id)]
    for topic_id in topics
]

# Flatten the list of lists.
words_list = [word for keywords in keywords_list for word in keywords]

# Keep the rows whose preprocessed text contains any keyword.
# NOTE(review): `word in text` is a substring test, so a short keyword also
# matches inside longer words.
filtered_col = col[
    col['text_pre'].apply(lambda text: any(word in text for word in words_list))
]

percentage = round(100 * len(filtered_col) / len(col), 2)
print(f"Del total de {len(col)} tweets de @MariaFdaCabal, alrededor de {len(filtered_col)} hablan sobre temas de género, es decir, cerca del {percentage}%")
print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 32462 tweets de @MariaFdaCabal, alrededor de 753 hablan sobre temas de género, es decir, cerca del 2.32%
Lista de palabras en tópicos [13]:
['madrid', 'real', 'hala', 'barcelona', 'macron', 'vs', 'copa', 'europeos', 'francia', 'atlético']
# Reduce the number of topics to 31.
# NOTE(review): `topic_model` and `docs` are defined outside this excerpt —
# confirm they belong to the account analysed in the cells that follow.
topic_model.reduce_topics(docs, nr_topics=31)

# Visualize topics (kept as the last expression so the notebook renders it).
topic_model.visualize_topics()
Code
# Draw the per-topic bar charts for up to 31 topics of `topic_model`
# (presumably a BERTopic model — it is created outside this excerpt).
topic_model.visualize_barchart(top_n_topics=31)
Code
# Topic ids whose keywords are used to select tweets.
topics = [6]

# One keyword list per selected topic; each entry returned by get_topic is
# indexed with [0] to take the word.
keywords_list = [
    [entry[0] for entry in topic_model.get_topic(topic_id)]
    for topic_id in topics
]

# Flatten the list of lists.
words_list = [word for keywords in keywords_list for word in keywords]

# Keep the rows whose preprocessed text contains any keyword.
# NOTE(review): `word in text` is a substring test, so a short keyword also
# matches inside longer words.
filtered_ecu = ecu[
    ecu['text_pre'].apply(lambda text: any(word in text for word in words_list))
]

percentage = round(100 * len(filtered_ecu) / len(ecu), 2)
print(f"Del total de {len(ecu)} tweets de @etorrescobo, alrededor de {len(filtered_ecu)} hablan sobre temas de género, es decir, cerca del {percentage}%")
print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 8314 tweets de @etorrescobo, alrededor de 423 hablan sobre temas de género, es decir, cerca del 5.09%
Lista de palabras en tópicos [6]:
['aborto', 'niños', 'mujeres', 'violación', 'adolescentes', 'feminismo', 'despenalización', 'mujer', 'hijos', 'vida']
# Reduce the number of topics to 31.
# NOTE(review): `topic_model` and `docs` are defined outside this excerpt —
# confirm they belong to the account analysed in the cell that follows.
topic_model.reduce_topics(docs, nr_topics=31)

# Per-topic bar charts for up to 31 topics (kept as the last expression so
# the notebook renders the figure).
topic_model.visualize_barchart(top_n_topics=31)
Code
# Topic ids whose keywords are used to select tweets.
topics = [12]

# One keyword list per selected topic; each entry returned by get_topic is
# indexed with [0] to take the word.
keywords_list = [
    [entry[0] for entry in topic_model.get_topic(topic_id)]
    for topic_id in topics
]

# Flatten the list of lists.
words_list = [word for keywords in keywords_list for word in keywords]

# Keep the rows whose preprocessed text contains any keyword.
# NOTE(review): `word in text` is a substring test, so a short keyword also
# matches inside longer words.
filtered_bra = bra[
    bra['text_pre'].apply(lambda text: any(word in text for word in words_list))
]

percentage = round(100 * len(filtered_bra) / len(bra), 2)
print(f"Del total de {len(bra)} tweets de @nikolas_dm, alrededor de {len(filtered_bra)} hablan sobre temas de género, es decir, cerca del {percentage}%")
print(f"Lista de palabras en tópicos {topics}:\n{words_list}")
Del total de 7035 tweets de @nikolas_dm, alrededor de 222 hablan sobre temas de género, es decir, cerca del 3.16%
Lista de palabras en tópicos [12]:
['mulher', 'aborto', 'feminista', 'feminismo', 'feministas', 'mulheres', 'movimento', 'chega', 'homem', 'geralmente']
# NOTE(review): `filtered_df` is built outside this excerpt — presumably the
# concatenation of the per-account keyword-filtered frames; confirm.

# Keep only tweets with a non-zero like count AND a non-zero retweet count
# (a row is dropped as soon as either count is 0). `.copy()` makes the result
# independent of the frame it was sliced from, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
filtered_df = filtered_df[
    (filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)
].copy()

# Mean of likes and retweets per tweet. The column keeps its original name,
# 'impressions', although it is an average of two engagement counts.
filtered_df['impressions'] = (
    filtered_df['like_count'] + filtered_df['retweet_count']
) / 2

# Extract the year from the datetime column.
filtered_df['year'] = filtered_df['date'].dt.year

# Clean the tweet text for the hover label. Only the URL option is enabled,
# so presumably only URLs are stripped.
# NOTE(review): `p` is imported outside this excerpt (presumably
# tweet-preprocessor) — confirm.
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(p.clean)

# Scatter plot of likes against retweets, one colour per account.
fig = px.scatter(
    filtered_df,
    x='like_count',
    y='retweet_count',
    # size='impressions',
    color='user_name',
    labels={"user_name": "Cuenta de Twitter"},
    color_discrete_sequence=["#FD9432", "#5647E5", "#F666f8"],
    hover_name='tweet_text',
)

# Title, axis labels and background colour.
fig.update_layout(
    title='Likes vs Retweets en tweets que hablan sobre temas de género',
    xaxis_title='Número de likes',
    yaxis_title='Número de retweets',
    plot_bgcolor="#eef4f5",
)
fig.show()