Analyzing Word Length Distribution with PermutationImportance

2024-12-16 12:08:40 3 Views

Code introduction

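This function uses eli5's PermutationImportance to rank the tokens of a text by how strongly they influence a Naive Bayes classifier trained to predict how common each word's length is, and returns the top n tokens together with their importance scores.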

Technology Stack : eli5, sklearn, CountVectorizer, MultinomialNB, PermutationImportance

Code Type : Function

Code Difficulty : Intermediate

                
                    
def random_word_length_distribution(text, n=10, max_word_length=10):
    """Rank the tokens of *text* by permutation importance.

    Despite its name, this function does not return a word-length
    histogram and involves no randomness beyond fixed seeds. It:

    1. splits *text* on whitespace and keeps words of at most
       ``max_word_length`` characters;
    2. labels every kept word with how many kept words share its length;
    3. trains a ``MultinomialNB`` on a bag-of-words encoding of an 80%
       training split of those words;
    4. scores each vocabulary token with eli5's ``PermutationImportance``
       and returns the ``n`` highest-scoring tokens.

    Args:
        text: Input text. ``None`` or an empty string is treated as
            containing no words.
        n: Maximum number of tokens to return. Values ``<= 0`` yield an
            empty result.
        max_word_length: Words longer than this are ignored.

    Returns:
        A ``(tokens, importances)`` tuple. ``tokens`` is a list of
        vocabulary tokens ordered from least to most important among the
        selected ones; ``importances`` is a NumPy array of the matching
        scores. Both are empty when there is nothing to rank.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when there are too
            few words to form a non-empty training split or when the
            vectorizer ends up with an empty vocabulary.
    """
    import numpy as np

    # Filter once; the same word list feeds both the length counts and
    # the training samples.
    words = [word for word in (text or "").split() if len(word) <= max_word_length]

    # Nothing to rank or nothing requested: answer without touching the
    # ML stack. This also avoids the `[-0:]` slice, which would select
    # every feature instead of none.
    if not words or n <= 0:
        return [], np.array([], dtype=float)

    # Third-party imports are deferred until there is real work to do.
    from collections import Counter

    from eli5.sklearn import PermutationImportance
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # Target for each word: how many words share its length.
    length_distribution = Counter(len(word) for word in words)
    labels = [length_distribution[len(word)] for word in words]

    # Only the training part is used below; the held-out part is dropped.
    X_train, _, y_train, _ = train_test_split(
        words, labels, test_size=0.2, random_state=42
    )

    # NOTE: CountVectorizer's default tokenizer lowercases and ignores
    # single-character tokens, so such words contribute no features.
    vectorizer = CountVectorizer()
    # Densified because eli5's PermutationImportance is documented to need
    # dense input (to my knowledge it rejects sparse matrices).
    X_train_dense = vectorizer.fit_transform(X_train).toarray()

    clf = MultinomialNB()
    clf.fit(X_train_dense, y_train)

    permutation = PermutationImportance(clf, random_state=42).fit(
        X_train_dense, y_train
    )
    # eli5 exposes the mean score decrease as `feature_importances_`.
    feature_importances = np.asarray(permutation.feature_importances_)

    # Stable ascending sort, then keep the last n (the largest scores).
    top_n_features = np.argsort(feature_importances, kind="stable")[-n:]

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases. Fetch the names once.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    selected_features = [feature_names[i] for i in top_n_features]
    return selected_features, feature_importances[top_n_features]