"""This file contains generic docstring chunks to be formatted using``docstring_formatter``."""
docstrings = dict() # Creating docstrings on the fly in order to refer to previously declared elements.
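
# A minimal, hypothetical usage sketch: assuming ``docstring_formatter`` substitutes
# ``{placeholders}`` in a function's docstring with the chunks declared below (the import
# path and the decorated function are illustrative only):
#
#     from ajmc.commons.docstrings import docstring_formatter, docstrings
#
#     @docstring_formatter(**docstrings)
#     def get_image(image_path):
#         """Reads a page image from disk.
#
#         Args:
#             image_path: {image_path}
#         """
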
docstrings['artifact_size_threshold'] = """The size threshold under which contours are considered artifacts and removed, expressed as a \
percentage of the image height. Default is 0.003."""
docstrings['BatchEncoding'] = """The default output of HuggingFace's ``TokenizerFast``. As the \
`docs <https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/tokenizer#transformers.BatchEncoding>`_ have it, *"This class is derived from \
a python dictionary and can be used as a dictionary. In addition, this class exposes utility methods to map from word/character space to token \
space"*. The object contains ``data`` and ``encodings``. ``data`` is directly accessible and has the form of a ``Dict[str, List[List[int]]]`` whose keys \
are model inputs. ``encodings`` is a list with one entry per example, containing notably the offsets. Please note that not using a ``TokenizerFast`` (i.e. using a \
``Tokenizer`` instead) can lead to the cumbersome situation in which ``self.encodings`` is set to ``None``."""
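
# Illustrative sketch of the ``BatchEncoding`` structure described above (the checkpoint
# name is only an example; any ``TokenizerFast`` behaves the same way):
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')   # fast tokenizer by default
#     encoding = tokenizer(['A first example', 'A second one'])
#     encoding.data        # {'input_ids': [[...], [...]], 'attention_mask': [[...], [...]], ...}
#     encoding.encodings   # one ``Encoding`` per example, exposing e.g. ``.offsets``
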
docstrings['root_dir'] = """The ``commentaries_data`` root directory. Use ``variables.COMMS_DATA_DIR`` to access the default value."""
docstrings['bbox'] = """A tuple of two (x, y) tuples representing upper-left and lower-right coordinates."""
docstrings['children_type'] = """The type of children to get. Must be one of ``pages``, ``regions``, ``lines`` or ``words``."""
docstrings['commentary_id'] = """The id of the commentary (e.g. ``sophoclesplaysa05campgoog``). Ids are listed in ``ajmc.commons.variables.ALL_COMM_IDS``."""
docstrings['coords_single'] = 'A ``Shape`` object representing the coordinates of the object.'
docstrings['comms_root_dir'] = """The root directory of a commentary's data, normally ``variables.COMMS_DATA_DIR / [comm_id]``. Use \
``variables.get_comm_root_dir()`` to retrieve it."""
docstrings['custom_dataset'] = """A dataset inheriting from ``torch.utils.data.Dataset``, implementing at least ``__len__()`` and \
``__getitem__()``, where each item is a dict like ``{{'model_input': tensor(), ...}}`` corresponding \
to a single example."""
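
# A minimal sketch of such a dataset (class and attribute names are purely illustrative):
#
#     import torch
#
#     class ExampleDataset(torch.utils.data.Dataset):
#         def __init__(self, examples):
#             self.examples = examples  # e.g. a list of dicts of lists of ints
#
#         def __len__(self):
#             return len(self.examples)
#
#         def __getitem__(self, idx):
#             # Each item is a dict mapping model-input names to tensors.
#             return {k: torch.tensor(v) for k, v in self.examples[idx].items()}
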
docstrings['dilation_kernel_size'] = """Dilation kernel size, preferably an odd number. Tweak \
this parameter and ``dilation_iterations`` to improve automatic boxing. Starting with 25 is recommended."""
docstrings['dilation_iterations'] = 'The number of dilation iterations. Default is 1.'
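
# Sketch of how these two parameters are typically used with OpenCV (the binarized
# image and the kernel shape are illustrative, not this module's actual pipeline):
#
#     import cv2
#     import numpy as np
#
#     kernel = np.ones((25, 25), dtype=np.uint8)                  # dilation_kernel_size = 25
#     dilated = cv2.dilate(binary_image, kernel, iterations=1)    # dilation_iterations = 1
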
docstrings['directory'] = 'The absolute path to the directory.'
docstrings['do_debug'] = """Whether to break loops after the first iteration."""
docstrings['groundtruth_dir'] = 'The absolute path to the directory containing groundtruth files.'
docstrings['ids_to_labels'] = """A dict mapping the label numbers (int) used by the model \
to the original label names (str), e.g. ``{{0: "O", 1: "B-PERS", ...}}``"""
docstrings['image_dir'] = 'The absolute path to the directory containing the images.'
docstrings['image_path'] = 'The absolute path to the image.'
docstrings['img_extension'] = """The extension of image files, including the ``.``, e.g. ``'.png'`` or ``'.jpg'``."""
docstrings['interval'] = 'A ``Tuple[int, int]`` defining the included boundaries of an interval, with start <= stop.'
docstrings['kwargs_for_properties'] = 'Use **kwargs to manually set or override properties.'
docstrings['labels_to_ids'] = 'A dict mapping label-names to their respective ids, e.g. ``{{"cat":0, "dog":1, ...}}``.'
docstrings['ocr_dir'] = 'The absolute path to the directory containing OCR outputs.'
docstrings['ocr_path'] = 'The absolute path to an ocr output file.'
docstrings['ocr_run_id'] = """The id of an ocr-run, e.g. '28o09e_tess_base' (generally follows the pattern \
'{get_62_based_datecode()}_{ocr_engine}_{ocr_model}')."""
docstrings['olr_region_type'] = """The type of the Region (i.e. ``'primary_text'`` or ``'commentary'``)."""
docstrings['parent_page'] = """The ``RawPage`` containing the object."""
docstrings['parent_type'] = """The type of the parent object. Must be one of ``commentary``, ``page``, ``region`` or ``line``."""
docstrings['path'] = 'The absolute path.'
docstrings['point'] = 'An iterable containing x and y coordinates (e.g. ``(123, 87)``).'
docstrings['points'] = """An iterable of iterables containing x-y points (e.g. ``[(12, 8), (15, 16), ...]``)."""
docstrings['transformers_model'] = """A model from ``transformers.models``."""
docstrings['transformers_model_inputs_names'] = """The names of the inputs required by the model, e.g. ``['input_ids', 'attention_mask',...]``."""
docstrings['transformers_model_inputs'] = """A mapping between the names of the model's inputs and ``torch.Tensor`` objects of size \
(max_length, batch_size).
Example:
``{'input_ids': torch.tensor([[int, int, ...], [int, int, ...]])}``."""
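
# Short illustrative sketch of how such a mapping is consumed (``model`` and ``batch``
# are assumed to exist; transformers models take their inputs as keyword arguments):
#
#     inputs = {name: batch[name] for name in ['input_ids', 'attention_mask']}
#     outputs = model(**inputs)
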
docstrings['transformers_model_predictions'] = """A ``np.ndarray`` containing the predicted labels, of shape (number of examples, length of an example)."""
docstrings['max_length'] = 'The maximum length of a sequence to be processed by the model.'
docstrings['sections_path'] = 'The absolute path to the sections json-file.'
docstrings['sheet_id'] = """The id of the spreadsheet, i.e. the part of the url after 'spreadsheets/d/'. Check ``commons.variables.SPREADSHEET_IDS`` \
for examples."""
docstrings['sheet_name'] = """The name of the sheet in the spreadsheet, for instance ``'Sheet1'`` or ``'olr_gt'``."""
docstrings['special_tokens'] = """LEGACY. A dict containing the model's special tokens for sequence start, end and pad, for instance \
``{{'start': {{'input_ids':100, ...}}, ...}}``"""
docstrings['via_dict'] = """A via-json compliant dict. Should look like:

.. code-block:: python

    {'shape_attributes': {'name': 'rect', 'x': 31, 'y': 54, 'width': 1230, 'height': 453},
     'region_attributes': {'text': 'preface'}}
"""
docstrings['via_path'] = 'The absolute path to the via_project json.'
docstrings['via_project'] = """A dictionary resulting from the reading of a via_project JSON file. Visit \
https://www.robots.ox.ac.uk/~vgg/software/via/ for more information."""
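
# Minimal sketch of reading a via_project file ('_via_img_metadata' follows the VIA 2.x
# export format; the file path is illustrative):
#
#     import json
#
#     with open('via_project.json', 'r') as f:
#         via_project = json.load(f)
#
#     for img_data in via_project['_via_img_metadata'].values():
#         for region in img_data['regions']:   # each region is a via-compliant dict (cf. ``via_dict``)
#             print(region['shape_attributes'], region['region_attributes'])
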
docstrings['word_range'] = """A tuple of two ints representing the start and end of the object in the commentary's text."""