Source code for orangecontrib.text.tag.pos

from typing import List, Callable

import nltk
import numpy as np
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor
from orangecontrib.text.util import chunkable


# Public names exported by a star-import of this module.
__all__ = ["POSTagger", "AveragedPerceptronTagger", "MaxEntTagger"]


class POSTagger(TokenizedPreprocessor):
    """Preprocessor that wraps an `nltk.TaggerI` and tags a Corpus.

    Only the wrapped tagger's ``tag_sents`` method is kept and used.
    """

    def __init__(self, tagger):
        """Store ``tagger.tag_sents`` as the callable used for tagging.

        :param tagger: object exposing a ``tag_sents`` method
        """
        self.tagger = tagger.tag_sents

    def __call__(self, corpus: Corpus, callback: Callable = None, **kw) -> Corpus:
        """Mark the tokens of a corpus with POS tags.

        The parent preprocessor is run first, reporting through ``callback``
        wrapped with ``end=0.2``; the tags are then computed from the
        resulting tokens and stored on ``pos_tags`` as a NumPy object array.

        :param corpus: corpus to tag
        :param callback: optional progress callback; ``dummy_callback`` is
            used when omitted
        :param kw: extra keyword arguments forwarded to ``_preprocess``
        :return: the corpus returned by the parent preprocessor, with
            ``pos_tags`` set
        """
        report = dummy_callback if callback is None else callback

        tokenized = super().__call__(corpus, wrap_callback(report, end=0.2))
        assert tokenized.has_tokens()

        report(0.2, "POS Tagging...")
        tag_lists = self._preprocess(tokenized.tokens, **kw)
        tokenized.pos_tags = np.array(tag_lists, dtype=object)
        return tokenized

    @chunkable
    def _preprocess(self, tokens: List[List[str]]) -> List[List[str]]:
        """Tag every document and keep only the tag of each tagged item.

        Each item produced by the tagger is indexed at position 1 (the tag
        in NLTK's ``(token, tag)`` pairs).
        """
        return [
            [tagged[1] for tagged in document]
            for document in self.tagger(tokens)
        ]
class AveragedPerceptronTagger(POSTagger):
    """POS tagger backed by ``nltk.PerceptronTagger``."""

    name = "Averaged Perceptron Tagger"

    @wait_nltk_data
    def __init__(self):
        """Create an ``nltk.PerceptronTagger`` and wrap it."""
        perceptron = nltk.PerceptronTagger()
        super().__init__(perceptron)


class MaxEntTagger(POSTagger):
    """POS tagger backed by NLTK's MaxEnt Treebank model resource."""

    name = "Treebank POS Tagger (MaxEnt)"

    @wait_nltk_data
    def __init__(self):
        """Load the MaxEnt Treebank tagger via ``nltk.data.load`` and wrap it."""
        # NOTE(review): the resource is a ``.pickle`` file -- confirm the
        # installed NLTK version still supports loading it this way.
        super().__init__(
            nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        )