Random Sentence and Word Analysis with NLTK

  • Share this:

Code introduction


This code defines a function that selects a random sentence from a text input and analyzes it with the NLTK library, returning the chosen sentence, the number of words remaining after stopword removal, and the number of unique lemmatized words.


Technology Stack : The code uses the NLTK library for text processing, including sentence tokenization, word tokenization, stopword removal, and lemmatization.

Code Type : Function

Code Difficulty : Intermediate


                
                    
import random
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def random_sentence_analysis(text):
    """
    Select a random sentence from *text* and analyze its words with NLTK.

    The text is split into sentences, one sentence is chosen at random,
    its tokens are filtered against the English stopword list
    (case-insensitively) and lemmatized with WordNetLemmatizer.

    Args:
        text: Input text to analyze.

    Returns:
        dict with keys:
            'sentence': the randomly chosen sentence (str),
            'word_count': number of tokens left after stopword removal,
            'unique_words': number of distinct lemmatized tokens.

    Raises:
        ValueError: if *text* contains no sentences (e.g. an empty string).
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    # Guard: random.choice raises an opaque IndexError on an empty
    # sequence — fail with a clear, catchable error instead.
    if not sentences:
        raise ValueError("text contains no sentences to analyze")
    # Select a random sentence
    random_sentence = random.choice(sentences)
    # Tokenize the random sentence into words
    words = word_tokenize(random_sentence)
    # Remove stopwords (case-insensitive match against the corpus list).
    # NOTE: punctuation tokens (e.g. '.') are not stopwords, so they
    # still count toward word_count — intentional per original behavior.
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Lemmatize with the default POS (noun); verbs/adjectives pass through.
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    # Return the analysis
    return {
        'sentence': random_sentence,
        'word_count': len(lemmatized_words),
        'unique_words': len(set(lemmatized_words))
    }

# Example usage — guarded so importing this module has no side effects.
if __name__ == "__main__":
    text_example = "NLTK is a leading platform for building Python programs to work with human language data."
    analysis = random_sentence_analysis(text_example)
    print(analysis)