Source code for ajmc.corpora.bibliographic_records

"""⚙️ WIP code to process bibliographic records"""

from pathlib import Path
from typing import Union, List

from bs4 import BeautifulSoup


# from lazy_objects.lazy_objects import lazy_property, lazy_init


class DublinCoreRecord:
    """Accessor over the ``dcterms:*`` fields of one parsed bibliographic record.

    All textual fields are returned lower-cased. Fields are exposed as
    properties because ``keywords_string`` and ``whole_text`` read their
    sibling fields as plain attributes (``self.keywords``, ``self.title``, ...).

    Args:
        soup: Parsed XML of a single record. Only its ``find`` and
            ``find_all`` methods are used.
    """

    def __init__(self, soup: 'BeautifulSoup'):
        # Every accessor below reads ``self.soup``, so it has to be stored here.
        self.soup = soup

    def get_property_tag_text(self, tag_name: str) -> str:
        """Return the lower-cased text of the first ``tag_name`` element.

        Args:
            tag_name: Qualified tag name to look up, e.g. ``'dcterms:title'``.

        Returns:
            The lower-cased text of the first matching element, or an empty
            string when the record has no such element.
        """
        tag = self.soup.find(tag_name)
        # ``find`` yields ``None`` for a missing element; an absent field
        # should not make the whole record unusable.
        if tag is None:
            return ''
        return tag.text.lower()

    @property
    def title(self) -> str:
        """Lower-cased text of the ``dcterms:title`` element."""
        return self.get_property_tag_text('dcterms:title')

    @property
    def creator(self) -> str:
        """Lower-cased text of the ``dcterms:creator`` element."""
        return self.get_property_tag_text('dcterms:creator')

    @property
    def publisher(self) -> str:
        """Lower-cased text of the ``dcterms:publisher`` element."""
        return self.get_property_tag_text('dcterms:publisher')

    @property
    def language(self) -> str:
        """Lower-cased text of the ``dcterms:language`` element."""
        return self.get_property_tag_text('dcterms:language')

    @property
    def keywords(self) -> List[str]:
        """Lower-cased text of every ``dcterms:subject`` element, in document order."""
        return [s.text.lower() for s in self.soup.find_all('dcterms:subject')]

    @property
    def keywords_string(self) -> str:
        """All keywords joined with single spaces."""
        return ' '.join(self.keywords)

    @property
    def description(self) -> str:
        """Lower-cased text of the ``dcterms:description`` element."""
        return self.get_property_tag_text('dcterms:description')

    @property
    def whole_text(self) -> str:
        """Title, creator, publisher, language, keywords and description, space-joined."""
        return ' '.join([
            self.title,
            self.creator,
            self.publisher,
            self.language,
            self.keywords_string,
            self.description,
        ])
def get_records_list(xmls_dir: Union[Path, str]) -> List[BeautifulSoup]:
    """Collect every ``record`` element from the XML files in a directory.

    Only files matching ``*.xml`` directly inside ``xmls_dir`` are read; the
    search is not recursive. Files are processed in sorted path order so the
    order of the returned records is deterministic.

    Args:
        xmls_dir: Directory containing the XML files.

    Returns:
        All ``record`` elements found, grouped by file and in document order
        within each file. Empty when no file matches.
    """
    records = []
    for path in sorted(Path(xmls_dir).glob('*.xml')):
        # Hand the parser raw bytes so it picks the encoding from the document
        # itself rather than from the platform's default text encoding.
        soup = BeautifulSoup(path.read_bytes(), features='xml')
        records.extend(soup.find_all('record'))
    return records