Source code for ajmc.commons.docstrings

"""This file contains generic docstring chunks to be formatted using``docstring_formatter``."""


[docs] def docstring_formatter(**kwargs): """Decorator with arguments used to format the docstring of a functions. ``docstring_formatter`` is a decorator with arguments, which means that it takes any set of ``kwargs`` as argument and returns a decorator. It should therefore always be called with parentheses (unlike traditional decorators - see below). It follows the grammar of ``str.format()``, i.e. ``{my_format_value}``. grammar. Example: This code snippet: .. code-block:: Python @docstring_formatter(greeting = 'hello') def my_func(): "A simple greeter that says {greeting}" # Do your stuff Is actually equivalent with : .. code-block:: Python def my_func(): "A simple greeter that says {greeting}" # Do your stuff my_func.__doc__ = my_func.__doc__.format(greeting = 'hello') Note: Best practice is to name your arguments in compliance with ``docstrings.docstrings`` in order to simply call ``@doctring_formatter(**docstrings.docstrings)``. """ def inner_decorator(func): func.__doc__ = func.__doc__.format(**kwargs) return func return inner_decorator
docstrings = dict() # Creating docstrings on the fly in order to refer to previously declared elements. docstrings['artifact_size_threshold'] = f"""Size-threshold under which contours are to be considered as artifacts and removed, expressed as a \ percentage of image height. Default is 0.003""" docstrings['BatchEncoding'] = """The default ouput of HuggingFace's ``TokenizerFast``. As the \ `docs <https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/tokenizer#transformers.BatchEncoding>`_ have it, *"This class is derived from \ a python dictionary and can be used as a dictionary. In addition, this class exposes utility methods to map from word/character space to token \ space"*. The object contains ``data`` and ``encodings``. Data is directly callable and has the form of a ``Dict[str, List[List[int]]]`` where keys \ are model inputs. Encodings is a list of example, containing notably the offsets. Please note that not using a ``TokenizerFast`` (i.e. using a \ ``Tokenizer`` instead) can lead to the cumbersome situation in which ``self.encodings`` is set to ``None``.""", docstrings['root_dir'] = """``commentaries_data`` root directory. Use variables.COMMS_DATA_DIR to access the default value.""" docstrings['bbox'] = """A tuple of two (x, y) tuples representing upper-left and lower-right coordinates.""" docstrings['children_type'] = """The type of children to get. Must be one of ``pages``, ``regions``, ``lines`` or ``words``.""" docstrings[ 'commentary_id'] = """The id of the commentary (e.g. sophoclesplaysa05campgoog). Ids are listed in ``ajmc.commons.variables.ALL_COMM_IDS``.""" docstrings['coords_single'] = 'A ``Shape`` object representing the coordinates of the object.' docstrings['comms_root_dir'] = """The base directory a commentaries data, normally ``variables.COMMS_DATA_DIR / [comm_id]``. Use \ ``variables.get_comm_root_dir()`` to retrieve it.""" docstrings['custom_dataset'] = """A dataset inheriting from ``torch.utils.data.Dataset``, implementing at least ``__len__`` and \ ``__getitem__()``, where each item is a dict alike ``{{'model_input': tensor(), ...}}`` corresponding \ to a single example.""" docstrings['dilation_kernel_size'] = """Dilation kernel size, preferably an odd number. Tweak \ this parameter and ``dilation_iterations`` to improve automatic boxing. Starting with 25 is recommended.""" docstrings['dilation_iterations'] = 'Number of iterations in dilation, default 1' docstrings['directory'] = 'The absolute path to the directory' docstrings['do_debug'] = """Whether break loops after the first iteration.""" docstrings['groundtruth_dir'] = 'The absolute path to the directory containg groundtruth files.' docstrings['ids_to_labels'] = """A dict mapping the label numbers (int) used by the model \ to the original label names (str), e.g. ``{{0: "O", 1: "B-PERS", ...}}``""" docstrings['image_dir'] = 'The absolute path to the directory containing the images.' docstrings['image_path'] = 'The absolute path to the image.' docstrings['img_extension'] = """The extension of image files, including the ``.``, e.g. ``'.png'`` or ``'.jpg'``.""" docstrings['interval'] = 'A ``Tuple[int, int]`` defining the included boundaries of an interval, with start <= stop.' docstrings['kwargs_for_properties'] = 'Use **kwargs to manually set or override properties.' docstrings['labels_to_ids'] = 'A dict mapping label-names to their respective ids, e.g. ``{{"cat":0, "dog":1, ...}}``.' docstrings['ocr_dir'] = 'The absolute path to the directory containing OCR outputs.' docstrings['ocr_path'] = 'The absolute path to an ocr output file.' docstrings['ocr_run_id'] = """The id of an ocr-run, e.g. '28o09e_tess_base', (generally follow the pattern \ '{get_62_based_datecode()}_{ocr_engine}_{ocr_model}').""" docstrings['olr_region_type'] = """The type of the Region (i.e. ``'primary_text'`` or ``'commentary'``).""" docstrings['parent_page'] = """"The ``RawPage`` containing the object""" docstrings['parent_type'] = """"The type of the parent object. Must be one of ``commentary``, ``page``, ``region`` or ``line``.""" docstrings['path'] = 'The absolute path' docstrings['point'] = 'Iterable containing x and y coordinates (e.g. ``(123, 87)``' docstrings['points'] = f"""Iterable of iterable containing x-y points (e.g. ``[(12,8), (15,16), ...]``. """ docstrings['transformers_model'] = """A ``transformers.models``.""" docstrings['transformers_model_inputs_names'] = """The name of the inputs required by the model, e.g. ``['input_ids', 'attention_mask',...]``.""" docstrings['transformers_model_inputs'] = """A mapping to between the names of the model's requirements and ``torch.Tensor`` of size \ (max_length, batch_size). Example: ``{'input_ids': torch.tensor([[int, int, ...], [int, int, ...]])``.""" docstrings['transformers_model_predictions'] = """``np.ndarray`` containing the predicted labels, so in the shape (number of exs, length of an ex).""" docstrings['max_length'] = 'The maximum length of a sequence to be processed by the model.' docstrings['sections_path'] = 'The absolute path to the sections json-file.' docstrings['sheet_id'] = """The id of the spreadsheet, i.e. the part of the url after 'spreadsheets/d/'. Check ``commons.variables.SPREADSHEET_IDS`` \ for examples.""" docstrings['sheet_name'] = """The name of the sheet in the spreadsheet, for instance 'Sheet1' or ``olr_gt``.""" docstrings['special_tokens'] = """LEGACY. A dict containing the model's special token for sequence start, end and pad, for instance \ ``{{'start': {{'input_ids':100, ...}}, ...}}``""" docstrings['via_dict'] = """A via-json compliant dict. Should look like: .. code-block:: python { 'shape_attributes': {'name': 'rect', 'x': 31, 'y': 54, 'width': 1230, 'height': 453}, 'region_attributes': {'text': 'preface'} """ docstrings['via_path'] = 'The absolute path to the via_project json.' docstrings['via_project'] = """A dictionary resulting from the reading of a via_project JSON file. Visit \ https://www.robots.ox.ac.uk/~vgg/software/via/ for more information.""" docstrings['word_range'] = """A tuple of two ints representing the start and end of the object in the commentary's text."""