You can download this code by clicking the button below.
This code is now available for download.
This function trains a MultinomialNB classifier on word-count features derived from the text (using words of length at most 10), then uses Eli5's PermutationImportance to identify and return the top n most important word features along with their importance scores.
Technology Stack : eli5, sklearn, CountVectorizer, MultinomialNB, PermutationImportance
Code Type : Function definition
Code Difficulty : Intermediate
def random_word_length_distribution(text, n=10):
    """Select the top-n most important word features of *text* via permutation importance.

    Splits *text* on whitespace, keeps words of length <= 10, labels each word
    with the frequency of its length, trains a MultinomialNB on bag-of-words
    counts, and ranks features with eli5's PermutationImportance.

    Args:
        text: Input text; words are extracted with ``str.split``.
        n: Number of top features to return (default 10).

    Returns:
        Tuple ``(selected_features, importances)`` where ``selected_features``
        is a list of the top-n feature names (ascending by importance) and
        ``importances`` is the corresponding array of importance scores.

    Raises:
        ValueError: If *text* contains too few eligible words to split into
            train/test sets.
    """
    from eli5.sklearn import PermutationImportance
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from collections import Counter

    # Keep only words of length <= 10; these are both the samples (X) and the
    # basis for the length-frequency labels (y).
    words = text.split()
    X = [word for word in words if len(word) <= 10]
    if len(X) < 2:
        # Fail early with a clear message instead of an opaque error from
        # train_test_split deep in the pipeline.
        raise ValueError("text must contain at least 2 words of length <= 10")

    # Label each word with how common its length is in the filtered text.
    length_distribution = Counter(len(word) for word in X)
    y = [length_distribution[len(word)] for word in X]

    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Bag-of-words counts over the training words.
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)

    # Train a Naive Bayes classifier on the count features.
    clf = MultinomialNB()
    clf.fit(X_train_vectorized, y_train)

    # Rank features by permutation importance on the training set.
    permutation = PermutationImportance(clf, random_state=42).fit(
        X_train_vectorized, y_train
    )
    # BUG FIX: eli5's PermutationImportance exposes ``feature_importances_``;
    # the previous ``importances_mean_`` attribute does not exist and raised
    # AttributeError.
    feature_importances = permutation.feature_importances_

    # Indices of the n largest importances, in ascending order of importance.
    top_n_features = sorted(
        range(len(feature_importances)), key=lambda i: feature_importances[i]
    )[-n:]

    # ``get_feature_names`` was removed in scikit-learn 1.2; use the
    # replacement ``get_feature_names_out``.
    feature_names = vectorizer.get_feature_names_out()
    selected_features = [feature_names[i] for i in top_n_features]
    return selected_features, feature_importances[top_n_features]