"""Text cleansing (or text preprocessing) is a crucial step in NLP to prepare raw text data for analysis. It involves removing noise and normalizing the text to improve the quality and accuracy of downstream tasks such as tokenization, classification, or sentiment analysis."""
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Sample sentence used by all of the cleaning examples below.
text = "Hello!!! This is an example: NLP @2024 with #Python. Let's clean it."

# Example 1: Lowercase
# Case-fold first so later steps treat "NLP" and "nlp" identically.
text_lower = text.lower()
print(text_lower)

# Example 2: Remove punctuation
# Keep only word characters and whitespace; digits survive this step.
text_no_punct = re.sub(r'[^\w\s]', '', text_lower)
print(text_no_punct)

# Example 3: Remove numbers
# Drop every digit run (e.g. the "2024" left over from "@2024").
text_no_numbers = re.sub(r'\d+', '', text_no_punct)
print(text_no_numbers)

# Example 4: Remove extra whitespace
# split() discards runs of whitespace; rejoining normalizes spacing.
tokens = text_no_numbers.split()
text_clean = " ".join(tokens)
print(text_clean)
# Example 5: Remove stopwords
# NLTK's English stopword list; a set gives O(1) membership tests.
# NOTE(review): assumes the 'stopwords' corpus has been downloaded
# (nltk.download('stopwords')) — confirm in the runtime environment.
stop_words = set(stopwords.words('english'))
filtered_words = [token for token in text_clean.split() if token not in stop_words]
print(filtered_words)
# Example 6: Remove special characters (like hashtags)
# Strip only the '#' and '@' marker characters from the ORIGINAL
# sentence (this example works on `text`, not the cleaned pipeline).
special_marks = re.compile(r'[#@]')
text_no_special = special_marks.sub('', text)
print(text_no_special)
# Example 7: Stemming
# Porter stemmer crudely chops suffixes (e.g. "cleaning" -> "clean");
# output stems are not guaranteed to be dictionary words.
ps = PorterStemmer()
stemmed = list(map(ps.stem, filtered_words))
print(stemmed)
# Example 8: Lemmatization
# WordNet lemmatizer maps each token to a dictionary base form.
# NOTE(review): with no POS argument it defaults to noun lemmas —
# verbs like "is" pass through unchanged; confirm this is intended.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))
print(lemmatized)
# Example 9: Spell correction (using textblob)
# TextBlob.correct() applies a probabilistic spelling fixer to the
# whole sentence; str() unwraps the TextBlob back to a plain string.
from textblob import TextBlob

blob = TextBlob(text)
corrected = str(blob.correct())
print(corrected)
# Example 10: Remove HTML tags (if present)
# BUG FIX: the original string literal was split across two physical
# lines (its HTML tags had been lost), which is a SyntaxError. Restored
# a tagged example whose tag-stripped form is the intended output.
html_text = "<p>Hello <b>world</b>!</p>"
# Non-greedy match so each tag is removed individually; note this is a
# demo-grade approach — a real HTML parser is safer for arbitrary input.
clean_html = re.sub(r'<.*?>', '', html_text)
print(clean_html)