You can download this code by clicking the button below.
This code is now available for download.
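The function uses the LdaMulticore model from the gensim library to perform topic modeling on a collection of texts: each text is tokenized, stripped of English stop words and lemmatized, and the function returns the top weighted keywords for each topic.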
Technology Stack : gensim, corpora, models, simple_preprocess, collections.defaultdict, nltk.stem.wordnet, nltk.corpus.stopwords
Code Type : Text analysis
Code Difficulty : Intermediate
def topic_modeling(
    texts,
    num_topics=10,
    num_words=5,
    *,
    passes=10,
    workers=2,
    random_state=None,
):
    """Fit an LDA topic model on ``texts`` and return each topic's top words.

    Every text is tokenized with gensim's ``simple_preprocess``, English stop
    words are dropped, and the remaining tokens are lemmatized with NLTK's
    ``WordNetLemmatizer`` before a ``LdaMulticore`` model is trained.

    Args:
        texts: Iterable of document strings. A single string is treated as
            one document rather than being iterated character by character.
        num_topics: Number of latent topics to fit.
        num_words: Number of top words to report per topic.
        passes: Number of training passes over the corpus.
        workers: Number of worker processes used by ``LdaMulticore``.
        random_state: Optional seed forwarded to ``LdaMulticore``; set it to
            make results reproducible between runs.

    Returns:
        The list produced by ``LdaMulticore.print_topics``: one
        ``(topic_id, formatted_words)`` tuple per topic. An empty list is
        returned when there are no documents or when no token survives
        preprocessing, because LDA cannot be trained on an empty vocabulary.

    Note:
        The NLTK ``stopwords`` and ``wordnet`` corpora must already be
        installed; this function does not download them.
    """
    # A bare string would otherwise be walked one character at a time.
    if isinstance(texts, str):
        texts = [texts]

    # Materialize once so generators work and emptiness can be checked.
    documents = list(texts)
    if not documents:
        return []

    # Heavy third-party imports are kept local (as in the original) and are
    # only paid for when there is actually something to model.
    from gensim import corpora, models
    from gensim.utils import simple_preprocess
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    # Initialize the lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words_set = set(stopwords.words('english'))

    # Preprocess the texts: tokenize, drop stop words, then lemmatize
    processed_texts = [
        [
            lemmatizer.lemmatize(word)
            for word in simple_preprocess(text)
            if word not in stop_words_set
        ]
        for text in documents
    ]

    # Create the dictionary and corpus
    dictionary = corpora.Dictionary(processed_texts)
    if len(dictionary) == 0:
        # Every document was empty or consisted solely of stop words; gensim
        # would raise "cannot compute LDA over an empty collection".
        return []
    corpus = [dictionary.doc2bow(tokens) for tokens in processed_texts]

    # Train the LDA model
    lda_model = models.LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

    # print_topics() defaults to num_topics=20, which would silently drop
    # topics whenever the model has more than 20; request all of them.
    return lda_model.print_topics(num_topics=num_topics, num_words=num_words)