Python – Lemmatization Approaches with Examples
Below is a step-by-step guide to the various lemmatization approaches in Python, along with several examples and code implementations. You are advised to work through the guide in order unless you already know the topic, in which case you can jump straight to any of the approaches given below.
What is Lemmatization?
Lemmatization is considerably more powerful than stemming. It goes beyond simple word reduction: it considers a language's full vocabulary to apply a morphological analysis to each word, aiming to remove only inflectional endings and return the base or dictionary form of the word, which is known as the lemma.
For clarity, look at the following examples:
Original Word ---> Root Word (lemma) Feature
meeting ---> meet (core-word extraction)
was ---> be (tense conversion to present tense)
mice ---> mouse (plural to singular)
TIP: Always convert your text to lowercase before performing any NLP task, including lemmatization.
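To make the table and the tip above concrete, here is a minimal sketch using NLTK's WordNet lemmatizer (covered in detail in approaches 1 and 2 below). The POS hints ('v' for verb, 'n' for noun) are supplied by hand purely for illustration; approach 2 shows how to derive them automatically.
Python3
# A minimal sketch of the conversions in the table above.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# POS hints are hard-coded here for illustration; see approach 2
# for deriving them automatically with a POS tagger
for word, pos in [('Meeting', 'v'), ('Was', 'v'), ('Mice', 'n')]:
    word = word.lower()  # lowercase first, per the tip above
    print(word, '--->', wnl.lemmatize(word, pos))
#> meeting ---> meet
#> was ---> be
#> mice ---> mouse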
Various Approaches to Lemmatization:
We will be going over 9 different approaches to perform lemmatization, along with multiple examples and code implementations.
- WordNet
- WordNet (with POS tag)
- TextBlob
- TextBlob (with POS tag)
- spaCy
- TreeTagger
- Pattern
- Gensim
- Stanford CoreNLP
1. Wordnet Lemmatizer
WordNet is a publicly available lexical database of over 200 languages that provides semantic relationships between its words. It is one of the earliest and most commonly used lemmatization techniques.
- It is present in Python's nltk library.
- WordNet links words into semantic relations (e.g. synonyms).
- It groups synonyms in the form of synsets.
- synsets: a group of data elements that are semantically equivalent (see the short sketch after this list).
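As a quick illustration of synsets, here is a small sketch; it assumes the wordnet corpus from the "How to use" step below has already been downloaded.
Python3
from nltk.corpus import wordnet

# 'car' and its synonyms are grouped together in one synset
syns = wordnet.synsets('car')
print(syns[0].name())
#> car.n.01
print(syns[0].lemma_names())
#> ['car', 'auto', 'automobile', 'machine', 'motorcar']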
How to use:
- Download the nltk package: In your anaconda prompt or terminal, type:
pip install nltk
- Download WordNet from nltk: In your Python console, execute the following:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
Code:
Python3
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Create WordNetLemmatizer object
wnl = WordNetLemmatizer()
# single word lemmatization examples
list1 = ['kites', 'babies', 'dogs', 'flying', 'smiling',
         'driving', 'died', 'tried', 'feet']

for words in list1:
    print(words + " ---> " + wnl.lemmatize(words))
#> kites ---> kite
#> babies ---> baby
#> dogs ---> dog
#> flying ---> flying
#> smiling ---> smiling
#> driving ---> driving
#> died ---> died
#> tried ---> tried
#> feet ---> foot
Code:
Python3
# sentence lemmatization examples
string = 'the cat is sitting with the bats on the striped mat under many flying geese'

# the 'punkt' tokenizer models are needed for word_tokenize
nltk.download('punkt')

# Converting String into tokens
list2 = nltk.word_tokenize(string)
print(list2)
#> ['the', 'cat', 'is', 'sitting', 'with', 'the', 'bats', 'on',
# 'the', 'striped', 'mat', 'under', 'many', 'flying', 'geese']
lemmatized_string = ' '.join([wnl.lemmatize(words) for words in list2])
print(lemmatized_string)
#> the cat is sitting with the bat on the striped mat under many flying goose
2. Wordnet Lemmatizer (with POS tag)
In the above approach, we observed that the WordNet results were not up to the mark. Words like 'sitting', 'flying', etc. remained the same after lemmatization. This is because these words were treated as nouns in the given sentence rather than verbs. To overcome this, we use POS (Part of Speech) tags.
We add a tag to a particular word that defines its type (verb, noun, adjective, etc.).
For example (a quick sketch follows these examples):
Word + Type (POS tag) ---> Lemmatized Word
driving + verb 'v' ---> drive
dogs + noun 'n' ---> dog
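Here is a quick sketch of how the pos argument changes the result; it reproduces the examples above directly:
Python3
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# the same word lemmatizes differently depending on the POS tag supplied
print(wnl.lemmatize('driving', 'v'))   #> drive
print(wnl.lemmatize('dogs', 'n'))      #> dog
print(wnl.lemmatize('driving'))        #> driving (POS defaults to noun, so no change)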
Code:
Python3
# WORDNET LEMMATIZER (with appropriate pos tags)
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# Define function to map an NLTK POS tag to a WordNet POS tag
# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

sentence = 'the cat is sitting with the bats on the striped mat under many badly flying geese'

# tokenize the sentence and find the POS tag for each token
pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
print(pos_tagged)
#> [('the', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('with', 'IN'),
#   ('the', 'DT'), ('bats', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('striped', 'JJ'),
#   ('mat', 'NN'), ('under', 'IN'), ('many', 'JJ'), ('badly', 'RB'), ('flying', 'VBG'),
#   ('geese', 'JJ')]

# As you may have noticed, the above pos tags are a little confusing.
# We use our own pos_tagger function to map them to the simpler WordNet tags.
wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
print(wordnet_tagged)
#> [('the', None), ('cat', 'n'), ('is', 'v'), ('sitting', 'v'), ('with', None),
#   ('the', None), ('bats', 'n'), ('on', None), ('the', None), ('striped', 'a'),
#   ('mat', 'n'), ('under', None), ('many', 'a'), ('badly', 'r'), ('flying', 'v'),
#   ('geese', 'a')]

lemmatized_sentence = []
for word, tag in wordnet_tagged:
    if tag is None:
        # if there is no available tag, append the token as is
        lemmatized_sentence.append(word)
    else:
        # else use the tag to lemmatize the token
        lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
lemmatized_sentence = " ".join(lemmatized_sentence)

print(lemmatized_sentence)
#> the cat be sit with the bat on the striped mat under many badly fly geese
3. TextBlob
TextBlob is a Python library used for processing textual data. It provides a simple API to access its methods and perform basic NLP tasks.
Download the TextBlob package: In your anaconda prompt or terminal, type:
pip install textblob
Code:
Python3
from textblob import TextBlob, Word
my_word = 'cats'
# create a Word object
w = Word(my_word)
print(w.lemmatize())
#> cat
sentence = 'the bats saw the cats with stripes hanging upside down by their feet.'
s = TextBlob(sentence)
lemmatized_sentence = " ".join([w.lemmatize() for w in s.words])
print(lemmatized_sentence)
#> the bat saw the cat with stripe hanging upside down by their foot
4. TextBlob (with POS tag)
Just as with the WordNet approach without appropriate POS tags, we observe the same limitation in this approach as well. So, we use one of the more powerful aspects of the TextBlob module, its 'Part of Speech' tagging, to overcome this problem.
Code:
Python3
from textblob import TextBlob

# Define function to lemmatize each word with its POS tag
# POS_TAGGER_FUNCTION : TYPE 2
def pos_tagger(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    words_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    lemma_list = [wd.lemmatize(tag) for wd, tag in words_tags]
    return lemma_list

# Lemmatize
sentence = "the bats saw the cats with stripes hanging upside down by their feet"
lemma_list = pos_tagger(sentence)
lemmatized_sentence = " ".join(lemma_list)
print(lemmatized_sentence)
#> the bat saw the cat with stripe hang upside down by their foot

# For comparison, lemmatizing without POS tags leaves 'hanging' unchanged
t_blob = TextBlob(sentence)
lemmatized_sentence = " ".join([w.lemmatize() for w in t_blob.words])
print(lemmatized_sentence)
#> the bat saw the cat with stripe hanging upside down by their foot
Here is a link for all the types of tag abbreviations with their meanings. (scroll down for the tags table)
5. spaCy
spaCy is an open-source Python library that parses and "understands" large volumes of text. Separate models are available that cater to specific languages (English, French, German, etc.).
Download the spaCy package:
(a) Open anaconda prompt or terminal as administrator and run the command:
pip install -U spacy
(b) Now, open anaconda prompt or terminal normally and run the command:
python -m spacy download en
If successful, you should see a message like:
Linking successful
C:\Anaconda3\envs\spacyenv\lib\site-packages\en_core_web_sm -->
C:\Anaconda3\envs\spacyenv\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en_core_web_sm').
Code:
Python3
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a Doc object
doc = nlp(u'the bats saw the cats with best stripes hanging upside down by their feet')
# Create list of tokens from given string
tokens = []
for token in doc:
tokens.append(token)
print(tokens)
#> [the, bats, saw, the, cats, with, best, stripes, hanging, upside, down, by, their, feet]
lemmatized_sentence = " ".join([token.lemma_ for token in doc])
print(lemmatized_sentence)
#> the bat see the cat with good stripe hang upside down by -PRON- foot
In the above code, we can observe that this approach was more powerful than our previous ones (a short token-level sketch follows this list):
- Even pronouns were detected (identified by -PRON-).
- Even 'best' was changed to 'good'.
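For completeness, here is a short sketch of inspecting tokens individually rather than joining the whole sentence. Note that the -PRON- lemma comes from spaCy 2.x models; spaCy 3.x models return 'their' for the pronoun instead.
Python3
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('the bats saw the cats with best stripes hanging upside down by their feet')

# per-token attributes: text, lemma and coarse POS tag
for token in doc[:3]:
    print(token.text, '->', token.lemma_, token.pos_)
#> the -> the DET
#> bats -> bat NOUN
#> saw -> see VERB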
6. TreeTagger
TreeTagger is a tool for annotating text with part-of-speech and lemma information. The TreeTagger has been successfully used to tag over 25 languages and is adaptable to other languages if a manually tagged training corpus is available. For example, tagging "TreeTagger is easy to use." produces:
Word         POS    Lemma
the          DT     the
TreeTagger   NP     TreeTagger
is           VBZ    be
easy         JJ     easy
to           TO     to
use          VB     use
.            SENT   .
How to use:
1. Download the TreeTagger package: In your anaconda prompt or terminal, type:
pip install treetaggerwrapper
2. Download the TreeTagger software: Click on TreeTagger and download the software as per your OS.
(Steps of installation given on the website)
Code:
Python3
# TREETAGGER LEMMATIZER
import pandas as pd
import treetaggerwrapper as tt

# use a raw string so the backslashes in the Windows path are not treated as escapes
t_tagger = tt.TreeTagger(TAGLANG='en', TAGDIR=r'C:\Windows\TreeTagger')

pos_tags = t_tagger.tag_text("the bats saw the cats with best stripes hanging upside down by their feet")

original = []
lemmas = []
tags = []
# each tagged token is a tab-separated string: word, POS tag, lemma
for t in pos_tags:
    original.append(t.split('\t')[0])
    tags.append(t.split('\t')[1])
    lemmas.append(t.split('\t')[-1])

Results = pd.DataFrame({'Original': original, 'Lemma': lemmas, 'Tags': tags})
print(Results)
#> Original Lemma Tags
# 0 the the DT
# 1 bats bat NNS
# 2 saw see VVD
# 3 the the DT
# 4 cats cat NNS
# 5 with with IN
# 6 best good JJS
# 7 stripes stripe NNS
# 8 hanging hang VVG
# 9 upside upside RB
# 10 down down RB
# 11 by by IN
# 12 their their PP$
# 13 feet foot NNS
7. Pattern
Pattern is a Python package commonly used for web mining, natural language processing, machine learning, and network analysis. It has many useful NLP capabilities. It also contains a special feature, which we will discuss below.
How to use:
Download the Pattern package: In your anaconda prompt or terminal, type:
pip install pattern
Code:
Python3
# PATTERN LEMMATIZER
from pattern.en import lemma, lexeme
sentence = "the bats saw the cats with best stripes hanging upside down by their feet"
lemmatized_sentence = " ".join([lemma(word) for word in sentence.split()])
print(lemmatized_sentence)
#> the bat see the cat with best stripe hang upside down by their feet
# Special Feature : to get all possible lemmas for each word in the sentence
all_lemmas_for_each_word = [lexeme(wd) for wd in sentence.split()]
print(all_lemmas_for_each_word)
#> [['the', 'thes', 'thing', 'thed'],
# ['bat', 'bats', 'batting', 'batted'],
# ['see', 'sees', 'seeing', 'saw', 'seen'],
# ['the', 'thes', 'thing', 'thed'],
# ['cat', 'cats', 'catting', 'catted'],
# ['with', 'withs', 'withing', 'withed'],
# ['best', 'bests', 'besting', 'bested'],
# ['stripe', 'stripes', 'striping', 'striped'],
# ['hang', 'hangs', 'hanging', 'hung'],
# ['upside', 'upsides', 'upsiding', 'upsided'],
# ['down', 'downs', 'downing', 'downed'],
# ['by', 'bies', 'bying', 'bied'],
# ['their', 'theirs', 'theiring', 'theired'],
# ['feet', 'feets', 'feeting', 'feeted']]
NOTE: If the above code raises an error saying 'generator raised StopIteration', just run it again; it usually works after 3-4 tries. (This is a known incompatibility between Pattern and Python 3.7+.)
8. Gensim
Gensim is designed to handle large text collections using data streaming. Its lemmatization facility is based on the Pattern package we installed above. (Note that gensim.utils.lemmatize() was removed in Gensim 4.x, so the code below requires Gensim 3.x.)
- The gensim.utils.lemmatize() function can be used to perform lemmatization. This method comes under the utils module in Python.
- We can use this lemmatizer from Pattern to extract UTF-8 encoded tokens in their base form (the lemma); see the short sketch after this list.
- By default, only nouns, verbs, adjectives, and adverbs are considered (all other lemmas are discarded).
- For example:
Word ---> Lemmatized Word
are/is/being ---> be
saw ---> see
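To see the raw output format before any post-processing, here is a tiny sketch; the output shown is what Gensim 3.x with Pattern installed would typically return.
Python3
from gensim.utils import lemmatize

# lemmatize() yields UTF-8 encoded byte strings of the form lemma/POS
print(lemmatize('the cats were sitting'))
#> [b'cat/NN', b'be/VB', b'sit/VB']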
How to use:
1. Download the Pattern package: In your anaconda prompt or terminal, type:
pip install pattern
2. Download the Gensim package: Open your anaconda prompt or terminal as administrator and type:
pip install gensim
OR
conda install -c conda-forge gensim
Code:
Python3
from gensim.utils import lemmatize
sentence = "the bats saw the cats with best stripes hanging upside down by their feet"
lemmatized_sentence = [word.decode('utf-8').split('.')[0] for word in lemmatize(sentence)]
print(lemmatized_sentence)
#> ['bat/NN', 'see/VB', 'cat/NN', 'best/JJ',
#   'stripe/NN', 'hang/VB', 'upside/RB', 'foot/NN']
NOTE: If the above code raises an error saying 'generator raised StopIteration', just run it again; it usually works after 3-4 tries.
As you may have noticed in the above code, the gensim lemmatizer ignores words like 'the', 'with', and 'by', since they do not fall into the 4 lemma categories mentioned above (noun/verb/adjective/adverb).
9. Stanford CoreNLP
CoreNLP enables users to derive linguistic annotations for text, including token and sentence boundaries, parts of speech, named entities, numeric and time values, dependency and constituency parses, sentiment, quote attributions, and relations.
- CoreNLP is your one-stop shop for natural language processing in Java!
- CoreNLP currently supports 6 languages: Arabic, Chinese, English, French, German, and Spanish.
How to use:
1. Get JAVA 8 : Download Java 8 (as per your OS) and install it.
2. Get Stanford_coreNLP package :
2.1) Download Stanford_CoreNLP and unzip it.
2.2) Open terminal
(a) go to the directory where you extracted the above file by doing
cd C:\Users\...\stanford-corenlp-4.1.0 on terminal
(b) then, start your Stanford CoreNLP server by executing the following command on terminal:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize, ssplit, pos, lemma, parse, sentiment" -port 9000 -timeout 30000
**(leave your terminal open as long as you use this lemmatizer)**
3. Download the stanfordcorenlp Python wrapper: Open your anaconda prompt or terminal and type:
pip install stanfordcorenlp
Code:
Python3
from stanfordcorenlp import StanfordCoreNLP
import json
# Connect to the CoreNLP server we just started
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

# Define properties needed to get lemmas
props = {'annotators': 'pos, lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}

sentence = "the bats saw the cats with best stripes hanging upside down by their feet"

# annotate() returns the server's response as a JSON string
parsed_str = nlp.annotate(sentence, properties=props)
print(parsed_str)
#> "sentences": [{"index": 0,
# "tokens": [
# {
# "index": 1,
# "word": "the",
# "originalText": "the",
# "lemma": "the", <--------------- LEMMA
# "characterOffsetBegin": 0,
# "characterOffsetEnd": 3,
# "pos": "DT",
# "before": "",
# "after": " "
# },
# {
# "index": 2,
# "word": "bats",
# "originalText": "bats",
# "lemma": "bat", <--------------- LEMMA
# "characterOffsetBegin": 4,
# "characterOffsetEnd": 8,
# "pos": "NNS",
# "before": " ",
# "after": " "
# },
# {
# "index": 3,
# "word": "saw",
# "originalText": "saw",
# "lemma": "see", <--------------- LEMMA
# "characterOffsetBegin": 9,
# "characterOffsetEnd": 12,
# "pos": "VBD",
# "before": " ",
# "after": " "
# },
# {
# "index": 4,
# "word": "the",
# "originalText": "the",
# "lemma": "the", <--------------- LEMMA
# "characterOffsetBegin": 13,
# "characterOffsetEnd": 16,
# "pos": "DT",
# "before": " ",
# "after": " "
# },
# {
# "index": 5,
# "word": "cats",
# "originalText": "cats",
# "lemma": "cat", <--------------- LEMMA
# "characterOffsetBegin": 17,
# "characterOffsetEnd": 21,
# "pos": "NNS",
# "before": " ",
# "after": " "
# },
# {
# "index": 6,
# "word": "with",
# "originalText": "with",
# "lemma": "with", <--------------- LEMMA
# "characterOffsetBegin": 22,
# "characterOffsetEnd": 26,
# "pos": "IN",
# "before": " ",
# "after": " "
# },
# {
# "index": 7,
# "word": "best",
# "originalText": "best",
# "lemma": "best", <--------------- LEMMA
# "characterOffsetBegin": 27,
# "characterOffsetEnd": 31,
# "pos": "JJS",
# "before": " ",
# "after": " "
# },
# {
# "index": 8,
# "word": "stripes",
# "originalText": "stripes",
# "lemma": "stripe", <--------------- LEMMA
# "characterOffsetBegin": 32,
# "characterOffsetEnd": 39,
# "pos": "NNS",
# "before": " ",
# "after": " "
# },
# {
# "index": 9,
# "word": "hanging",
# "originalText": "hanging",
# "lemma": "hang", <--------------- LEMMA
# "characterOffsetBegin": 40,
# "characterOffsetEnd": 47,
# "pos": "VBG",
# "before": " ",
# "after": " "
# },
# {
# "index": 10,
# "word": "upside",
# "originalText": "upside",
# "lemma": "upside", <--------------- LEMMA
# "characterOffsetBegin": 48,
# "characterOffsetEnd": 54,
# "pos": "RB",
# "before": " ",
# "after": " "
# },
# {
# "index": 11,
# "word": "down",
# "originalText": "down",
# "lemma": "down", <--------------- LEMMA
# "characterOffsetBegin": 55,
# "characterOffsetEnd": 59,
# "pos": "RB",
# "before": " ",
# "after": " "
# },
# {
# "index": 12,
# "word": "by",
# "originalText": "by",
# "lemma": "by", <--------------- LEMMA
# "characterOffsetBegin": 60,
# "characterOffsetEnd": 62,
# "pos": "IN",
# "before": " ",
# "after": " "
# },
# {
# "index": 13,
# "word": "their",
# "originalText": "their",
# "lemma": "they"#, <--------------- LEMMA
# "characterOffsetBegin": 63,
# "characterOffsetEnd": 68,
# "pos": "PRP$",
# "before": " ",
# "after": " "
# },
# {
# "index": 14,
# "word": "feet",
# "originalText": "feet",
# "lemma": "foot", <--------------- LEMMA
# "characterOffsetBegin": 69,
# "characterOffsetEnd": 73,
# "pos": "NNS",
# "before": " ",
# "after": ""
# }
# ]
# }
# ]
Code:
Python3
# To get the lemmatized sentence as output
# ** RUN THE ABOVE SCRIPT FIRST **

# parse the JSON string returned by the server into a dictionary
parsed_dict = json.loads(parsed_str)

lemma_list = []
for item in parsed_dict['sentences'][0]['tokens']:
    for key, value in item.items():
        if key == 'lemma':
            lemma_list.append(value)

print(lemma_list)
#> ['the', 'bat', 'see', 'the', 'cat', 'with', 'best', 'stripe', 'hang', 'upside', 'down', 'by', 'they', 'foot']

lemmatized_sentence = " ".join(lemma_list)
print(lemmatized_sentence)
#> the bat see the cat with best stripe hang upside down by they foot
Conclusion:
These are the various lemmatization approaches that you can refer to while working on an NLP project. The choice of lemmatization approach depends entirely on the project's requirements; each approach has its own advantages and drawbacks. Lemmatization is mandatory for critical projects where sentence structure matters, such as language applications.