Source code for ajmc.corpora.cleaning_utils
import re
import typing
from collections import Counter
from pathlib import Path
[docs]
def find_recurrent_lines(path: str,
                         n_first_elements: typing.Optional[int] = None,
                         recurrence_threshold: typing.Optional[int] = None):
    """Print the most frequently repeated non-blank lines of a text file.

    The file is read as UTF-8 and split on ``'\\n'``. Lines that are empty or
    contain only whitespace are ignored; the remaining lines are counted
    verbatim (they are not stripped before counting).

    Args:
        path: Path to the text file to inspect.
        n_first_elements: If truthy, print the list of the
            ``n_first_elements`` most common ``(line, count)`` pairs.
        recurrence_threshold: If truthy, print a dict mapping every line that
            occurs at least ``recurrence_threshold`` times to its count.

    Returns:
        None. Results are written to standard output only.
    """
    content = Path(path).read_text(encoding='utf-8')
    # ``str.strip`` yields an empty (falsy) string for blank lines, so it
    # doubles as the "is this line non-blank?" predicate.
    tally = Counter(filter(str.strip, content.split('\n')))

    if n_first_elements:
        most_frequent = tally.most_common(n_first_elements)
        print(most_frequent)

    if recurrence_threshold:
        recurrent = {}
        for entry, occurrences in tally.items():
            if occurrences >= recurrence_threshold:
                recurrent[entry] = occurrences
        print(recurrent)
[docs]
def basic_clean(text: str) -> str:
    """Strip URLs from ``text`` and normalise its whitespace.

    Every ``http://`` or ``https://`` URL (up to the next whitespace
    character) is removed, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # URLs are removed first: deleting them after the whitespace pass would
    # leave a double space (or a trailing space) where each URL used to be.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapses every whitespace run to one space
    return text.strip()  # Removes leading and trailing whitespace
[docs]
def harmonise_linebreaks(text: str) -> str:
    """Normalise line breaks and whitespace in ``text``.

    * A run of three or more consecutive newlines becomes exactly three.
    * A run of exactly two consecutive newlines is kept as is.
    * Everything else (single newlines and any other run of whitespace) is
      collapsed to a single space.

    The text is split on the newline runs instead of swapping them for
    sentinel strings, so input that already contains ``'@@@@'``, ``'||||'``
    or a ``'|'`` next to a line break is left intact.

    Args:
        text: The text to normalise.

    Returns:
        The text with harmonised line breaks.
    """
    # The capturing group makes ``re.split`` keep the separators: even indices
    # hold ordinary text, odd indices hold the runs of two or more newlines.
    pieces = re.split(r'(\n{2,})', text)

    harmonised = []
    for index, piece in enumerate(pieces):
        if index % 2:
            # Newline run: keep a double break, cap anything longer at three.
            harmonised.append('\n\n' if len(piece) == 2 else '\n\n\n')
        else:
            # Ordinary text: lone newlines and other whitespace become one space.
            harmonised.append(re.sub(r'\s+', ' ', piece))
    return ''.join(harmonised)