"""
Module implementing glossaries for translations.
"""
import yaml
import tempfile
import shutil
import csv
from pathlib import Path
from dataclasses import dataclass, field
from typing import Type, TypeVar

GlossaryInst = TypeVar('GlossaryInst', bound='Glossary')


class GlossaryError(Exception):
    """Raised when a glossary cannot be read, written or imported."""


def str_presenter(dumper: yaml.Dumper, data: str) -> yaml.ScalarNode:
    """
    Changes the YAML dump style to multiline syntax for multiline strings.

    Strings spanning more than one line are emitted in literal block style
    ('|'); all other strings use the dumper's default scalar style.
    """
    if len(data.splitlines()) > 1:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


@dataclass
class Glossary:
    """
    A glossary consists of the following parameters:
    - Name (freely selectable)
    - Path (full file path)
    - Source language
    - Target language
    - Entries (pairs of source lang and target lang terms)
    - ID (automatically generated / modified, required by DeepL)
    """

    name: str
    source_lang: str
    target_lang: str
    entries: dict[str, str] = field(default_factory=dict)
    file_path: Path | None = None
    ID: str | None = None

    @classmethod
    def from_file(cls: Type[GlossaryInst], file_path: Path) -> GlossaryInst:
        """
        Create a glossary from the given file.

        The file must be a YAML mapping with the keys 'Name', 'ID',
        'SourceLang', 'TargetLang' and 'Entries' (as written by 'to_file()').

        Raises:
            GlossaryError: if the file content is not a valid glossary.
            OSError: if the file cannot be opened (not wrapped).
        """
        with open(file_path, "r", encoding="utf-8") as fd:
            try:
                # NOTE(security): FullLoader is more permissive than needed for
                # plain mappings of strings; consider 'yaml.safe_load()' if
                # glossary files may come from untrusted sources.
                data = yaml.load(fd, Loader=yaml.FullLoader)
                # remove any quotes from the entries that YAML may have added while dumping
                # (e. g. for special keywords like 'yes')
                clean_entries = {key.strip('\"\' '): value for key, value in data['Entries'].items()}
                return cls(name=data['Name'],
                           source_lang=data['SourceLang'],
                           target_lang=data['TargetLang'],
                           entries=clean_entries,
                           file_path=file_path,
                           # 'to_file()' stores a missing ID as the string 'None'
                           ID=data['ID'] if data['ID'] != 'None' else None)
            except Exception as err:
                # chain the cause so the actual parsing problem stays visible
                raise GlossaryError(f"'{file_path}' does not contain a valid glossary") from err

    def to_file(self, file_path: Path | None = None) -> None:
        """
        Write glossary to given file.

        If 'file_path' is given, it replaces the glossary's stored path;
        otherwise the stored path is used. The YAML is first written to a
        temporary file in the target directory and then moved into place, so
        that a failed write does not leave a truncated glossary file behind.

        Raises:
            GlossaryError: if neither 'file_path' nor a stored path is available.
        """
        if file_path:
            self.file_path = file_path
        if not self.file_path:
            raise GlossaryError("Got no valid path to write glossary")
        data = {'Name': self.name,
                'ID': str(self.ID),
                'SourceLang': self.source_lang,
                'TargetLang': self.target_lang,
                'Entries': self.entries}
        temp_file_path: Path | None = None
        try:
            # write YAML
            with tempfile.NamedTemporaryFile(dir=self.file_path.parent,
                                             prefix=self.file_path.name,
                                             mode="w",
                                             encoding="utf-8",
                                             delete=False) as temp_fd:
                temp_file_path = Path(temp_fd.name)
                yaml.dump(data, temp_fd, sort_keys=False)
            # move only after the temporary file has been flushed and closed
            shutil.move(temp_file_path, self.file_path)
        except Exception:
            # do not leave stray temporary files next to the glossary
            if temp_file_path is not None:
                temp_file_path.unlink(missing_ok=True)
            raise

    def export_csv(self, dictionary: dict[str, str], file_path: Path) -> None:
        """
        Export the 'entries' of this glossary to a file in CSV format (compatible with DeepL).

        Note: the 'dictionary' argument is not used, the exported data is
        always 'self.entries'. The parameter is kept for compatibility with
        existing callers.
        """
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerows(self.entries.items())

    def export_tsv(self, entries: dict[str, str], file_path: Path) -> None:
        """
        Export the 'entries' of this glossary to a file in TSV format (compatible with DeepL).

        Note: the 'entries' argument is not used, the exported data is
        always 'self.entries'. The parameter is kept for compatibility with
        existing callers.
        """
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{source_entry}\t{target_entry}\n"
                            for source_entry, target_entry in self.entries.items())

    def import_csv(self, file_path: Path) -> None:
        """
        Import the entries from the given CSV file to those of the current glossary.
        Existing entries are overwritten.

        Rows with fewer than two columns are skipped; additional columns are
        ignored.

        Raises:
            GlossaryError: if the file cannot be read or parsed.
        """
        try:
            # newline='' lets the csv module handle line endings itself
            # (required for quoted fields that contain newlines)
            with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                self.entries = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}
        except Exception as e:
            raise GlossaryError(f"Error importing CSV: {e}") from e

    def import_tsv(self, file_path: Path) -> None:
        """
        Import the entries from the given TSV file to those of the current glossary.
        Existing entries are overwritten.

        Lines that do not consist of exactly two tab-separated fields (after
        stripping surrounding whitespace) are skipped.

        Raises:
            GlossaryError: if the file cannot be read or parsed.
        """
        try:
            # collect into a local dict first, so that a failure while reading
            # does not leave the glossary with partially imported entries
            imported: dict[str, str] = {}
            with open(file_path, mode='r', encoding='utf-8') as tsvfile:
                for line in tsvfile:
                    parts = line.strip().split('\t')
                    if len(parts) == 2:
                        imported[parts[0]] = parts[1]
            self.entries = imported
        except Exception as e:
            raise GlossaryError(f"Error importing TSV: {e}") from e