Sentence tokenization is the process of splitting a paragraph or longer text into individual sentences. It is a foundational step in NLP tasks such as parsing, sentiment analysis, and machine translation. NLTK's sent_tokenize relies on the pre-trained, unsupervised Punkt model, so the corresponding data package has to be downloaded once before use.
import nltk
from nltk.tokenize import sent_tokenize
# One-time setup: download the pre-trained Punkt sentence tokenizer
# (recent NLTK releases may ask for 'punkt_tab' instead of 'punkt').
nltk.download('punkt')
text = "Hello world! This is an example. NLP is fun. Let's tokenize sentences."
# Example 1: Basic sentence tokenization
sentences = sent_tokenize(text)
print(sentences)
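# Expected output with the standard English Punkt model:
# ['Hello world!', 'This is an example.', 'NLP is fun.', "Let's tokenize sentences."]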
# Example 2: Tokenize a paragraph with abbreviations
# The pre-trained Punkt model is built to handle common abbreviations such as
# "Dr." far better than a naive split on periods.
text2 = "Dr. Smith went to Washington. He arrived at 3 p.m. It was sunny."
print(sent_tokenize(text2))
# Example 3: Using PunktSentenceTokenizer for training
from nltk.tokenize import PunktSentenceTokenizer
# Passing raw text to the constructor trains an unsupervised Punkt model on it;
# two sentences are only enough for a demonstration, so in practice you would
# train on a large corpus from your own domain.
custom_tokenizer = PunktSentenceTokenizer(text2)
print(custom_tokenizer.tokenize(text2))
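# A related option (a minimal sketch, not part of the original examples): seed
# the tokenizer with known abbreviations through PunktParameters instead of
# training; abbrev_types holds lowercase abbreviations without the final dot.
from nltk.tokenize.punkt import PunktParameters
punkt_params = PunktParameters()
punkt_params.abbrev_types = {'dr', 'vs', 'prof'}
abbrev_tokenizer = PunktSentenceTokenizer(punkt_params)
print(abbrev_tokenizer.tokenize("Dr. Smith debated Prof. Jones. They met at noon."))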
# Example 4: Using SpaCy for sentence tokenization
import spacy
# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print([sent.text for sent in doc.sents])
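# If no pre-trained model is installed, spaCy's rule-based "sentencizer"
# component splits on punctuation alone (a minimal sketch, assuming spaCy v3).
nlp_blank = spacy.blank("en")
nlp_blank.add_pipe("sentencizer")
print([sent.text for sent in nlp_blank(text).sents])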
# Example 5: Sentence tokenization with regex (basic)
import re
sentences_regex = re.split(r'(?<=[.!?]) +', text)
print(sentences_regex)
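# Caveat worth illustrating: the naive regex splits after every period, so an
# abbreviation such as "Dr." produces a spurious sentence break.
print(re.split(r'(?<=[.!?]) +', text2))  # "Dr." is wrongly treated as a sentence end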
# Example 6: Tokenizing sentences with newline characters
text3 = "Hello world!\nHow are you?\nI am fine."
print(sent_tokenize(text3))
# Example 7: Tokenize complex sentences with quotes
text4 = 'He said, "NLP is amazing!" Did you hear that?'
print(sent_tokenize(text4))
# Example 8: Count sentences
print(f"Number of sentences: {len(sentences)}")
# Example 9: Tokenize sentences and strip whitespace
sentences_stripped = [s.strip() for s in sentences]
print(sentences_stripped)
# Example 10: Sentence tokenization in other languages (e.g., German)
text_de = "Hallo Welt! Dies ist ein Beispiel."
print(sent_tokenize(text_de, language='german'))
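# Alternatively (a sketch that assumes the classic punkt data is downloaded),
# a language-specific Punkt model can be loaded directly from nltk.data:
import nltk.data
german_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
print(german_tokenizer.tokenize(text_de))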