# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Rouge Score metric for PyTorch.
"""
import string
from collections import Counter
from typing import Optional
import numpy as np
import torch
from nltk import ngrams
from modelzoo.common.pytorch.metrics.cb_metric import CBMetric
from modelzoo.transformers.data_processing.tokenizers.Tokenization import (
FullTokenizer,
)


class _PipelineRougeScoreMetric(CBMetric):
    r"""Custom evaluation metric for calculating the ROUGE score when
    performing text summarization.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures the
    fraction of n-grams from the human-written abstracts that also appear in
    the generated summary:

    \begin{equation}
    \text{ROUGE-}n(s) = \frac{\sum_{r \in R} \sum_{w} [w \in s][w \in r]}
                             {\sum_{r \in R} \sum_{w} [w \in r]}
    \end{equation}

    * $r \in R$ -- the set of abstracts written by humans.
    * $s$ -- the abstract built by the system.
    * Higher is better, for all metrics of the ROUGE family.
    * $n$ -- the order of the n-gram: $n=1$ for unigrams, $n=2$ for bigrams,
      etc. Increasing $n$ makes the match stricter; with $n$ equal to the
      abstract length, the predicted text must fully match one written by
      humans.

    The counts num_matched_ngrams, num_references_ngrams, and
    num_hypotheses_ngrams are accumulated in a rouge matrix, and the ROUGE
    scores (f1, precision, recall) are then calculated from it.
    """
def __init__(
self, vocab_file: str, max_n: int = 1, name: Optional[str] = None
):
"""
Args:
vocab_file: Path to the vocab file.
            max_n: Order of the n-grams used to compute the score
                (e.g., 1 for unigrams, 2 for bigrams). Default is 1.
name: Name of the metric.
"""
self.max_n = max_n
self.vocab_file = vocab_file
self.tokenizer = FullTokenizer(self.vocab_file)
super().__init__(name=name)
def init_state(self):
self.reset_state()
def reset_state(self):
# We store 3 items: num_matched_ngrams, num_references_ngrams
# and num_hypothesis_ngrams.
self.rouge_matrix = np.zeros((3,), dtype=np.float64)
def update_on_host(
self, labels, predictions, cls_indices, cls_weights, input_ids
):
"""
Compute and aggregate rouge_matrix every iteration.
Each computation comprises of:
1. Convert labels to references.
2. Convert predictions to hypotheses.
3. Convert hypotheses and references to ngrams.
4. Calculate rouge matrix.
"""
def _preprocess_before_rouge(sentences):
def _preprocess_sentence_before_rouge(sentence):
special_words = {"[pad]", "[cls]", "[sep]"}
punctuation_words = set(string.punctuation)
words_to_ignore = punctuation_words | special_words
words_in_sentence = [word.lower() for word in sentence]
words_in_sentence = list(
filter(
lambda word: word not in words_to_ignore,
words_in_sentence,
)
)
return " ".join(words_in_sentence)
            processed_sentences = np.array(
                [
                    _preprocess_sentence_before_rouge(sentence)
                    for sentence in sentences
                ]
            )
            return processed_sentences
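        # Illustrative example (not executed): _preprocess_sentence_before_rouge
        # turns ["The", "cat", ",", "[SEP]"] into "the cat" by lowercasing
        # and dropping punctuation and special tokens.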
predictions = predictions.detach()
labels = labels.detach()
cls_indices = cls_indices.detach()
cls_weights = cls_weights.detach()
input_ids = input_ids.detach()
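        # extract_text_words_given_cls_indices is assumed to be defined
        # elsewhere in this module: given per-[CLS] labels or predictions,
        # the [CLS] positions and weights, the input token ids, and a
        # tokenizer, it is expected to return the words of the selected
        # sentences for each sample.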
hypotheses = extract_text_words_given_cls_indices(
predictions, cls_indices, cls_weights, input_ids, self.tokenizer
)
references = extract_text_words_given_cls_indices(
labels, cls_indices, cls_weights, input_ids, self.tokenizer
)
hypotheses = _preprocess_before_rouge(hypotheses)
references = _preprocess_before_rouge(references)
current_rouge_matrix = np.zeros((3,), dtype=np.float64)
hypotheses = [x.split(" ") for x in hypotheses]
references = [x.split(" ") for x in references]
hypotheses_ngrams = [
ngrams(sentence, self.max_n) for sentence in hypotheses
]
references_ngrams = [
ngrams(sentence, self.max_n) for sentence in references
]
hypotheses_freq = [
Counter(hypotheses_sentence_ngrams)
for hypotheses_sentence_ngrams in hypotheses_ngrams
]
references_freq = [
Counter(references_sentence_ngrams)
for references_sentence_ngrams in references_ngrams
]
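        # Counter intersection (&) below clips each n-gram's match count at
        # its frequency in the reference, e.g. (illustrative, not executed):
        #   Counter({("the",): 2}) & Counter({("the",): 1})
        #   == Counter({("the",): 1})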
matched_ngrams_freq = []
num_matched_ngrams = 0
num_references_ngrams = 0
num_hypotheses_ngrams = 0
# For each sentence, compute the number of matched n-grams, and
# total n-grams in hypotheses and references.
for sent_idx in range(len(hypotheses_freq)):
matched_ngrams_freq.append(
hypotheses_freq[sent_idx] & references_freq[sent_idx]
)
num_matched_ngrams += sum(matched_ngrams_freq[-1].values())
num_references_ngrams += sum(references_freq[sent_idx].values())
num_hypotheses_ngrams += sum(hypotheses_freq[sent_idx].values())
current_rouge_matrix[0] = num_matched_ngrams
current_rouge_matrix[1] = num_references_ngrams
current_rouge_matrix[2] = num_hypotheses_ngrams
self.rouge_matrix += current_rouge_matrix
def compute(self):
"""
        Compute the f1, precision, and recall scores from the rouge matrix.
"""
num_matched_ngrams = self.rouge_matrix[0]
num_references_ngrams = self.rouge_matrix[1]
num_hypotheses_ngrams = self.rouge_matrix[2]
        precision = (
            num_matched_ngrams / num_hypotheses_ngrams
            if num_hypotheses_ngrams
            else 0.0
        )
        recall = (
            num_matched_ngrams / num_references_ngrams
            if num_references_ngrams
            else 0.0
        )
        f1_score = (
            2 * precision * recall / (precision + recall)
            if precision and recall
            else 0.0
        )
return {
"precision": precision,
"recall": recall,
"f1_score": f1_score,
}
def _pad_input_sequence(input_sequence, max_sequence_length):
input_sequence = input_sequence + ["[pad]"] * (
max_sequence_length - len(input_sequence)
)
return np.array(input_sequence, dtype=object)
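# Illustrative example (not executed, assuming max_sequence_length is at
# least the sequence length):
#   _pad_input_sequence(["hello", "world"], 4)
#   -> np.array(["hello", "world", "[pad]", "[pad]"], dtype=object)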
# Create the metric class via the factory; the implementation is chosen
# based on the execution strategy (no weight-streaming implementation here).
RougeScoreMetric = CBMetric.create_metric_impl_factory(
pipeline_metric_cls=_PipelineRougeScoreMetric, ws_metric_cls=None
)
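# A minimal usage sketch (illustrative only; the vocab path and keyword
# values are placeholders, and the eval runner normally drives the
# update/compute calls):
#   metric = RougeScoreMetric(vocab_file="/path/to/vocab.txt", max_n=2)
#   metric.update_on_host(labels, predictions, cls_indices, cls_weights, input_ids)
#   scores = metric.compute()  # {"precision": ..., "recall": ..., "f1_score": ...}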