Source code for ajmc.ocr.pytorch.ctc_decoder_torch

# From https://github.com/SeanNaren/deepspeech.pytorch, relying on https://github.com/parlance/ctcdecode for the beam-search decoder.

from typing import List

import torch


class Decoder:
    """
    Basic decoder class from which all other decoders inherit. Implements several helper
    functions. Subclasses should implement the decode() method.

    Arguments:
        classes (list): mapping from integers to characters.
        blank_index (int, optional): index for the blank '_' character. Defaults to 0.
    """

    def __init__(self, classes, blank_index=0):
        self.classes = classes
        self.indices_to_classes = dict([(i, c) for (i, c) in enumerate(classes)])
        self.blank_index = blank_index
        # To prevent errors in decode, we add an out-of-bounds index for the space
        space_index = len(classes)
        if ' ' in classes:
            space_index = classes.index(' ')
        self.space_index = space_index

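    # Illustrative note (classes value assumed for demonstration, not taken from the repository):
    # with classes = '_ab' and blank_index = 0, indices_to_classes maps {0: '_', 1: 'a', 2: 'b'},
    # and space_index falls back to the out-of-range value len(classes) == 3 because ' ' is not
    # among the classes.
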
    def decode(self, probs, sizes=None):
        """
        Given a matrix of character probabilities, returns the decoder's best guess of the transcription.

        Arguments:
            probs: Tensor of character probabilities, where probs[c, t] is the probability of character c at time t
            sizes (optional): Size of each sequence in the mini-batch

        Returns:
            string: sequence of the model's best guess for the transcription
        """
        raise NotImplementedError


class BeamCTCDecoder(Decoder):

    def __init__(self, classes, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0,
                 beam_width=100, num_processes=4, blank_index=0):
        super(BeamCTCDecoder, self).__init__(classes, blank_index)
        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError:
            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
        classes = list(classes)  # Ensure classes are a list before passing to the decoder
        self._decoder = CTCBeamDecoder(classes, lm_path, alpha, beta, cutoff_top_n, cutoff_prob,
                                       beam_width, num_processes, blank_index)

    def convert_to_strings(self, out, seq_len):
        results = []
        for b, batch in enumerate(out):
            utterances = []
            for p, utt in enumerate(batch):
                size = seq_len[b][p]
                if size > 0:
                    transcript = ''.join(map(lambda x: self.indices_to_classes[x.item()], utt[0:size]))
                else:
                    transcript = ''
                utterances.append(transcript)
            results.append(utterances)
        return results

    def convert_tensor(self, offsets, sizes):
        results = []
        for b, batch in enumerate(offsets):
            utterances = []
            for p, utt in enumerate(batch):
                size = sizes[b][p]
                if size > 0:
                    utterances.append(utt[0:size])
                else:
                    utterances.append(torch.tensor([], dtype=torch.int))
            results.append(utterances)
        return results

    def decode(self, probs, sizes=None):
        """
        Decodes probability output using the ctcdecode package.

        Arguments:
            probs: Tensor of character probabilities, where probs[c, t] is the probability of character c at time t
            sizes: Size of each sequence in the mini-batch

        Returns:
            strings: sequences of the model's best guess for the transcription
            offsets: time-step offsets for each predicted character
        """
        probs = probs.cpu()
        out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
        strings = self.convert_to_strings(out, seq_lens)
        offsets = self.convert_tensor(offsets, seq_lens)
        return strings, offsets

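# Illustrative usage sketch (assumes the optional ctcdecode package is installed; the classes
# string, beam width and `probs` tensor below are made-up placeholders, not values from this
# repository):
#
#     decoder = BeamCTCDecoder('_ab ', beam_width=25, blank_index=0)
#     strings, offsets = decoder.decode(probs)  # probs: batch x seq_length x num_classes

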
class GreedyDecoder(Decoder):

    def __init__(self, classes, blank_index=0):
        super(GreedyDecoder, self).__init__(classes, blank_index)

    # @profile
    def process_string(self, sequence: torch.Tensor, size: int, remove_repetitions=False) -> str:
        string = ''
        for i in range(size):
            char = self.indices_to_classes[sequence[i].item()]
            if char != self.indices_to_classes[self.blank_index]:
                # If this char is a repetition and remove_repetitions=True, then skip it
                if remove_repetitions and i != 0 and char == self.indices_to_classes[sequence[i - 1].item()]:
                    pass
                elif self.space_index < len(self.classes) and char == self.classes[self.space_index]:
                    string += ' '
                else:
                    string += char
        return string

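    # Worked example (illustrative, values assumed): with classes = '_ab', blank_index = 0 and the
    # argmax index sequence [1, 1, 0, 2], remove_repetitions=True yields 'ab': the repeated 'a'
    # collapses to a single character and the blank '_' is dropped.
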
    # @profile
    def decode(self, probs, sizes=None, remove_repetitions: bool = True) -> List[str]:
        """
        Returns the argmax decoding given the probability matrix. Removes repeated elements
        in the sequence, as well as blanks.

        Arguments:
            probs: Tensor of character probabilities from the network. Expected shape: batch x seq_length x output_dim
            sizes (optional): Size of each sequence in the mini-batch
            remove_repetitions: Whether to remove repeated characters in the decoded output

        Returns:
            strings: sequences of the model's best guess for the transcription on inputs
        """
        max_probs = torch.argmax(probs, 2)
        sequences = max_probs.view(max_probs.size(0), max_probs.size(1))
        strings = []
        for x in range(len(sequences)):
            seq_len = sizes[x] if sizes is not None else len(sequences[x])
            string = self.process_string(sequences[x], seq_len, remove_repetitions)
            strings.append(string)
        return strings
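

# Minimal usage sketch (not part of the original module): greedy-decode a toy batch of
# per-timestep class probabilities. The classes string and tensor values are invented for
# demonstration only.
if __name__ == '__main__':
    classes = '_ab '
    decoder = GreedyDecoder(classes, blank_index=0)
    # Shape: batch x seq_length x num_classes; the per-timestep argmax is [1, 1, 0, 2],
    # which collapses to 'ab' (repetition merged, blank dropped).
    probs = torch.tensor([[[0.10, 0.80, 0.05, 0.05],
                           [0.10, 0.70, 0.10, 0.10],
                           [0.90, 0.05, 0.03, 0.02],
                           [0.10, 0.10, 0.70, 0.10]]])
    print(decoder.decode(probs))  # ['ab']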