Named Entity Recognition (NER) is the process of identifying and classifying key entities in text into predefined categories such as names of persons, organizations, locations, dates, and more. NLTK provides a pre-trained named entity chunker through the ne_chunk function, which recognizes common entity types such as PERSON, ORGANIZATION, and GPE.
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# One-time downloads of the models the pipeline needs.
# Note: on NLTK >= 3.9 the resource names changed; use 'punkt_tab',
# 'averaged_perceptron_tagger_eng', and 'maxent_ne_chunker_tab' instead.
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
text = "Apple is looking at buying U.K. startup for $1 billion"
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
print("Tokens:", tokens)
print("POS Tags:", pos_tags)
named_entities = ne_chunk(pos_tags)  # returns an nltk.Tree; entities appear as labeled subtrees
print(named_entities)
for chunk in named_entities:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))
entities = []
for chunk in named_entities:
    if hasattr(chunk, 'label'):
        entity = " ".join(c[0] for c in chunk)
        label = chunk.label()
        entities.append((entity, label))
print("Recognized Entities:", entities)
sentences = [
    "Elon Musk founded SpaceX.",
    "Google acquired YouTube for $1.65 billion."
]
for sent in sentences:
    tokens = word_tokenize(sent)
    pos_tags = pos_tag(tokens)
    tree = ne_chunk(pos_tags)
    for chunk in tree:
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))
document = "Barack Obama visited Facebook headquarters in California."
tokens = word_tokenize(document)
pos_tags = pos_tag(tokens)
tree = ne_chunk(pos_tags)
summary_entities = [' '.join(c[0] for c in chunk)
                    for chunk in tree if hasattr(chunk, 'label')]
summary = f"This document mentions: {', '.join(summary_entities)}"
print(summary)
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)
# Typical en_core_web_sm output: Apple -> ORG, U.K. -> GPE, $1 billion -> MONEY
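spaCy also processes batches of texts efficiently through nlp.pipe; a quick sketch reusing the sentences list defined earlier:

for doc in nlp.pipe(sentences):
    print([(ent.text, ent.label_) for ent in doc.ents])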
from spacy import displacy

doc = nlp("Google was founded by Larry Page and Sergey Brin in California.")
html = displacy.render(doc, style="ent", jupyter=False)  # returns HTML markup; use jupyter=True inside notebooks
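Outside a notebook, the returned markup can be written to a file and opened in a browser; the filename entities.html is just an example:

with open("entities.html", "w", encoding="utf-8") as f:
    f.write(html)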
# Placeholder: training a custom NER model requires annotated data and pipeline setup.
# In spaCy:
# 1. Collect labeled data (e.g., character-offset annotations or CoNLL files).
# 2. Convert it to spaCy's binary DocBin format.
# 3. Fine-tune with the spacy train CLI.
# Example CLI command:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
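A minimal sketch of step 2, assuming a tiny hand-labeled dataset annotated with character offsets (TRAIN_DATA and its offsets here are illustrative):

import spacy
from spacy.tokens import DocBin

# (text, [(start_char, end_char, label), ...]) -- illustrative annotations
TRAIN_DATA = [
    ("Elon Musk founded SpaceX.", [(0, 9, "PERSON"), (18, 24, "ORG")]),
]

nlp_blank = spacy.blank("en")
db = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp_blank.make_doc(text)
    spans = [doc.char_span(start, end, label=label)
             for start, end, label in annotations]
    doc.ents = [s for s in spans if s is not None]  # drop misaligned spans
    db.add(doc)
db.to_disk("./train.spacy")  # matches --paths.train in the command above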
doc = nlp("Amazon is opening a new office near the Amazon river.")
for ent in doc.ents:
print(ent.text, ent.label_)
# The label alone may not disambiguate the company from the river; for that, apply
# knowledge-based entity linking, e.g. spaCy's EntityLinker component backed by a trained KnowledgeBase.
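As a rough illustration of the linking idea only (not spaCy's actual EntityLinker, which requires a trained knowledge base), a plain lookup keyed on surface form and label can stand in; the Wikidata-style IDs below are illustrative:

# Toy stand-in for knowledge-based linking; real systems use a trained KB.
TOY_KB = {
    ("Amazon", "ORG"): "Q3884",  # the company (illustrative ID)
    ("Amazon", "LOC"): "Q3783",  # the river (illustrative ID)
}
for ent in doc.ents:
    print(ent.text, ent.label_, "->", TOY_KB.get((ent.text, ent.label_), "UNKNOWN"))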