Welcome!

Welcome to AllenNLP! This tutorial will walk you through the basics of building and training an AllenNLP model. Before we get started, make sure you have a clean Python 3.6 or 3.7 virtual environment, and then run
    pip install allennlp
to install the AllenNLP library. In this tutorial we'll implement a slightly enhanced version of the PyTorch LSTM for Part-of-Speech Tagging tutorial, adding some features that make the task more realistic (and that also showcase some of the benefits of AllenNLP). Specifically, we'll:
  1. read data from files rather than having it hardcoded (a sample of the file format appears right after this list)
  2. use a validation dataset that's distinct from the training dataset
  3. use tqdm to track the training progress (and corresponding loss metric)
  4. implement early stopping based on validation loss
  5. track accuracy during training and validation
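
For reference, here's what such a data file looks like: one sentence per line, with each word joined to its tag by ###. (These example sentences are illustrative; the first matches the sample in the dataset reader's docstring below, and the second is borrowed from the PyTorch tutorial's toy data.)

    The###DET dog###NN ate###V the###DET apple###NN
    Everybody###NN read###V that###DET book###NN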

The Problem

Given a sentence (e.g. "The dog ate the apple") we want to predict part-of-speech tags for each word (e.g ["DET", "NN", "V", "DET", "NN"]). As in the PyTorch tutorial, we'll embed each word in a low-dimensional space, pass them through an LSTM to get a sequence of encodings, and use a feedforward layer to transform those into a sequence of logits (corresponding to the possible part-of-speech tags). The code is simple enough that we'll just annotate it as we go.
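
As a roadmap, here's how tensor shapes flow through that pipeline (a sketch, assuming batch size B and sentence length N; the dimension names match the constants defined below):

    # word ids    -> embeddings:      (B, N, EMBEDDING_DIM)
    # embeddings  -> LSTM encodings:  (B, N, HIDDEN_DIM)
    # encodings   -> tag logits:      (B, N, num_tags)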
    # pylint: disable=invalid-name,arguments-differ,redefined-outer-name
    from typing import Iterator, List, Dict

We use type annotations all over our code, so we have to import a few things here.
    import torch
    import torch.optim as optim
    import numpy as np

AllenNLP is built on PyTorch, so we'll import a few of its features to use directly.
    from allennlp.data import Instance
    from allennlp.data.fields import TextField, SequenceLabelField

In AllenNLP we represent each training example as an Instance consisting of Fields of various types. Here each example is a sentence (which we store in a TextField) with corresponding part-of-speech tags (which we store in a SequenceLabelField).
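
To make that concrete, here's a minimal sketch of building a single Instance by hand (the toy tokens and tags are illustrative, not from the tutorial's data files):

    from allennlp.data.token_indexers import SingleIdTokenIndexer
    from allennlp.data.tokenizers import Token

    # One sentence with its tags, wrapped in the two field types above.
    tokens = [Token("The"), Token("dog")]
    sentence_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
    instance = Instance({
        "sentence": sentence_field,
        "labels": SequenceLabelField(labels=["DET", "NN"], sequence_field=sentence_field),
    })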
    from allennlp.data.dataset_readers import DatasetReader
    from allennlp.data.iterators import BasicIterator
    from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
    from allennlp.data.tokenizers import Token
    from allennlp.data.vocabulary import Vocabulary
    from allennlp.models import Model
    from allennlp.modules.feedforward import FeedForward
    from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
    from allennlp.modules.token_embedders import Embedding
    from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
    from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
    from allennlp.predictors import SentenceTaggerPredictor
    from allennlp.training.metrics import CategoricalAccuracy
    from allennlp.training.trainer import Trainer
    torch.manual_seed(1)
    class PosDatasetReader(DatasetReader):
        """
        DatasetReader for PoS tagging data, one sentence per line, like

            The###DET dog###NN ate###V the###DET apple###NN
        """
        def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
            super().__init__(lazy=False)
            self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
            sentence_field = TextField(tokens, self.token_indexers)
            fields = {"sentence": sentence_field}

            if tags:
                label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
                fields["labels"] = label_field

            return Instance(fields)
        def _read(self, file_path: str) -> Iterator[Instance]:
            with open(file_path) as f:
                for line in f:
                    pairs = line.strip().split()
                    sentence, tags = zip(*(pair.split("###") for pair in pairs))
                    yield self.text_to_instance([Token(word) for word in sentence], tags)
    reader = PosDatasetReader()
    train_dataset = reader.read('tutorials/tagger/training.txt')
    validation_dataset = reader.read('tutorials/tagger/validation.txt')
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
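
Vocabulary.from_instances builds token-to-index mappings for every namespace it encounters, here 'tokens' (words) and 'labels' (PoS tags). If you want to poke at it, here's an optional sanity check (not part of the tutorial script):

    print(vocab.get_vocab_size('tokens'))           # number of word types (plus padding / OOV entries)
    print(vocab.get_vocab_size('labels'))           # number of distinct PoS tags
    print(vocab.get_token_from_index(0, 'tokens'))  # typically the padding token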
    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6
    class LstmTagger(Model):
        def __init__(self,
                     word_embeddings: TextFieldEmbedder,
                     encoder: Seq2SeqEncoder,
                     vocab: Vocabulary) -> None:
            super().__init__(vocab)
            self.word_embeddings = word_embeddings
            self.encoder = encoder
            self.hidden2tag = FeedForward(input_dim=encoder.get_output_dim(),
                                          num_layers=1,
                                          hidden_dims=vocab.get_vocab_size('labels'),
                                          activations=lambda x: x)
            self.accuracy = CategoricalAccuracy()
        def forward(self,
                    sentence: Dict[str, torch.Tensor],
                    labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
            embeddings = self.word_embeddings(sentence)
            mask = get_text_field_mask(sentence)
            encoder_out = self.encoder(embeddings, mask)
            tag_logits = self.hidden2tag(encoder_out)
            output = {"tag_logits": tag_logits}

            if labels is not None:
                self.accuracy(tag_logits, labels, mask)
                output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

            return output
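
Here get_text_field_mask returns a (batch_size, sequence_length) tensor with 1s at real tokens and 0s at padding, and both the accuracy metric and the loss use it to ignore padded positions. A toy illustration (hypothetical values):

    # Two sentences of lengths 5 and 3, padded to length 5:
    # mask = [[1, 1, 1, 1, 1],
    #         [1, 1, 1, 0, 0]]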
        def get_metrics(self, reset: bool = False) -> Dict[str, float]:
            return {"accuracy": self.accuracy.get_metric(reset)}
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BasicIterator(batch_size=2)
    iterator.index_with(vocab)
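
The iterator groups instances into batches of 2 and pads each batch to its longest sentence, while index_with tells it which vocabulary to use when turning tokens into ids. If you're curious, you can pull out one batch to inspect, as an optional check (the "sentence"/"tokens" keys come from the field and indexer names defined above):

    batch = next(iterator(train_dataset, num_epochs=1))
    print(batch["sentence"]["tokens"].shape)  # (2, longest sentence in the batch)
    print(batch["labels"].shape)              # same shape, one tag id per token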
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
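
The patience=10 argument gives us the early stopping promised in the feature list: if the validation loss hasn't improved in the last 10 epochs, training halts, so we almost certainly won't run all 1000 epochs. The trainer also calls our get_metrics method each epoch and reports accuracy and loss in its tqdm progress bars.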
    # No need to see what the scores are before training,
    # our trainer will show the loss over time.
    # Train
    trainer.train()
    # See what the scores are after training
    # Make predictions
    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_scores = predictor.predict("The dog ate the apple")['tag_logits']
    print(tag_scores)
    tag_ids = np.argmax(tag_scores, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])