166 lines
6.5 KiB
Python
166 lines
6.5 KiB
Python
"""
|
|
Module implementing glossaries for translations.
|
|
"""
|
|
import yaml
|
|
import tempfile
|
|
import shutil
|
|
import csv
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, field
|
|
from typing import Type, TypeVar, ClassVar
|
|
|
|
# Type variable bound to Glossary so 'from_file' returns the calling subclass type.
GlossaryInst = TypeVar('GlossaryInst', bound='Glossary')
|
|
|
|
|
|
class GlossaryError(Exception):
    """Raised for any glossary-related failure (missing file, bad format, bad suffix)."""
|
|
|
|
|
|
def str_presenter(dumper: yaml.Dumper, data: str) -> yaml.ScalarNode:
    """
    Changes the YAML dump style to multiline syntax for multiline strings.

    Strings spanning more than one line are emitted in block ('|') style;
    all other strings keep the default scalar style.
    """
    block_style = '|' if len(data.splitlines()) > 1 else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)
|
|
|
|
|
|
@dataclass
|
|
class Glossary:
|
|
"""
|
|
A glossary consists of the following parameters:
|
|
- Name (freely selectable)
|
|
- Path (full file path, suffix is automatically generated)
|
|
- Source language
|
|
- Target language
|
|
- Description (optional)
|
|
- Entries (pairs of source lang and target lang terms)
|
|
- ID (automatically generated / modified, required by DeepL)
|
|
"""
|
|
|
|
name: str
|
|
source_lang: str
|
|
target_lang: str
|
|
file_path: Path | None = None
|
|
desc: str | None = None
|
|
entries: dict[str, str] = field(default_factory=lambda: dict())
|
|
ID: str | None = None
|
|
file_suffix: ClassVar[str] = '.glo'
|
|
|
|
def __post_init__(self) -> None:
|
|
# FIXME: check for valid languages
|
|
pass
|
|
|
|
@classmethod
|
|
def from_file(cls: Type[GlossaryInst], file_path: Path) -> GlossaryInst:
|
|
"""
|
|
Create a glossary from the given file.
|
|
"""
|
|
if not file_path.exists():
|
|
raise GlossaryError(f"Glossary file '{file_path}' does not exist")
|
|
if file_path.suffix != cls.file_suffix:
|
|
raise GlossaryError(f"File type '{file_path.suffix}' is not supported")
|
|
with open(file_path, "r") as fd:
|
|
try:
|
|
# use BaseLoader so every entry is read as a string
|
|
# - disables automatic conversions
|
|
# - makes it possible to omit quoting for YAML keywords in entries (e. g. 'yes')
|
|
# - also correctly reads quoted entries
|
|
data = yaml.load(fd, Loader=yaml.BaseLoader)
|
|
clean_entries = data['Entries']
|
|
return cls(name=data['Name'],
|
|
source_lang=data['SourceLang'],
|
|
target_lang=data['TargetLang'],
|
|
file_path=file_path,
|
|
desc=data['Description'],
|
|
entries=clean_entries,
|
|
ID=data['ID'] if data['ID'] != 'None' else None)
|
|
except Exception:
|
|
raise GlossaryError(f"'{file_path}' does not contain a valid glossary")
|
|
|
|
def to_file(self, file_path: Path | None = None) -> None:
|
|
"""
|
|
Write glossary to given file.
|
|
"""
|
|
if file_path:
|
|
self.file_path = file_path
|
|
if not self.file_path:
|
|
raise GlossaryError("Got no valid path to write glossary")
|
|
# check / add valid suffix
|
|
if not self.file_path.suffix:
|
|
self.file_path = self.file_path.with_suffix(self.file_suffix)
|
|
elif self.file_path.suffix != self.file_suffix:
|
|
raise GlossaryError(f"File suffix '{self.file_path.suffix}' is not supported")
|
|
# write YAML
|
|
with tempfile.NamedTemporaryFile(dir=self.file_path.parent, prefix=self.file_path.name, mode="w", delete=False) as temp_fd:
|
|
temp_file_path = Path(temp_fd.name)
|
|
data = {'Name': self.name,
|
|
'Description': str(self.desc),
|
|
'ID': str(self.ID),
|
|
'SourceLang': self.source_lang,
|
|
'TargetLang': self.target_lang,
|
|
'Entries': self.entries}
|
|
yaml.dump(data, temp_fd, sort_keys=False)
|
|
shutil.move(temp_file_path, self.file_path)
|
|
|
|
def export_csv(self, dictionary: dict[str, str], file_path: Path) -> None:
|
|
"""
|
|
Export the 'entries' of this glossary to a file in CSV format (compatible with DeepL).
|
|
"""
|
|
with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
|
|
writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
|
|
for source_entry, target_entry in self.entries.items():
|
|
writer.writerow([source_entry, target_entry])
|
|
|
|
def export_tsv(self, entries: dict[str, str], file_path: Path) -> None:
|
|
"""
|
|
Export the 'entries' of this glossary to a file in TSV format (compatible with DeepL).
|
|
"""
|
|
with open(file_path, 'w', encoding='utf-8') as file:
|
|
for source_entry, target_entry in self.entries.items():
|
|
file.write(f"{source_entry}\t{target_entry}\n")
|
|
|
|
def import_csv(self, file_path: Path) -> None:
|
|
"""
|
|
Import the entries from the given CSV file to those of the current glossary.
|
|
Existing entries are overwritten.
|
|
"""
|
|
try:
|
|
with open(file_path, mode='r', encoding='utf-8') as csvfile:
|
|
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
|
|
self.entries = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}
|
|
except Exception as e:
|
|
raise GlossaryError(f"Error importing CSV: {e}")
|
|
|
|
def import_tsv(self, file_path: Path) -> None:
|
|
"""
|
|
Import the entries from the given CSV file to those of the current glossary.
|
|
Existing entries are overwritten.
|
|
"""
|
|
try:
|
|
with open(file_path, mode='r', encoding='utf-8') as tsvfile:
|
|
self.entries = {}
|
|
for line in tsvfile:
|
|
parts = line.strip().split('\t')
|
|
if len(parts) == 2:
|
|
self.entries[parts[0]] = parts[1]
|
|
except Exception as e:
|
|
raise GlossaryError(f"Error importing TSV: {e}")
|
|
|
|
def to_str(self, with_entries: bool = False) -> str:
|
|
"""
|
|
Return the current glossary as a string.
|
|
"""
|
|
output: list[str] = []
|
|
output.append(f'{self.name} (ID: {self.ID}):')
|
|
if self.desc and self.desc != 'None':
|
|
output.append('- ' + self.desc)
|
|
output.append(f'- Languages: {self.source_lang} -> {self.target_lang}')
|
|
if with_entries:
|
|
output.append('- Entries:')
|
|
for source, target in self.entries.items():
|
|
output.append(f' {source} : {target}')
|
|
else:
|
|
output.append(f'- Entries: {len(self.entries)}')
|
|
return '\n'.join(output)
|