Source code for graphtransliterator.core

# -*- coding: utf-8 -*-

"""
GraphTransliterator core classes.
"""
from .ambiguity import check_for_ambiguity
from .compression import compress_config, decompress_config
from .exceptions import (
    IncompleteOnMatchRulesCoverageException,
    IncorrectVersionException,
    NoMatchingTransliterationRuleException,
    UnrecognizableInputTokenException,
)
from .graphs import VisitLoggingDirectedGraph, VisitLoggingList
from .initialize import (
    _graph_from,
    _onmatch_rules_lookup,
    _tokenizer_pattern_from,
    _tokens_by_class_of,
    _unescape_charnames,
)
from .process import _process_easyreading_settings
from .schemas import (
    DirectedGraphSchema,
    EasyReadingSettingsSchema,
    OnMatchRuleSchema,
    SettingsSchema,
    TransliterationRuleSchema,
    WhitespaceSettingsSchema,
)
from collections import deque

from graphtransliterator import __version__ as __version__
import json
import logging
from marshmallow import (
    fields,
    pre_load,
    post_load,
    Schema,
    validates_schema,
    ValidationError,
)
import re
import yaml

logger = logging.getLogger("graphtransliterator")

DEFAULT_COMPRESSION_LEVEL = 2
HIGHEST_COMPRESSION_LEVEL = 2


class GraphTransliteratorSchema(Schema):
    """Schema for Graph Transliterator."""

    tokens = fields.Dict(
        keys=fields.Str(), values=fields.List(fields.Str()), required=True
    )
    rules = fields.Nested(TransliterationRuleSchema, many=True, required=True)
    whitespace = fields.Nested(WhitespaceSettingsSchema, many=False, required=True)
    onmatch_rules = fields.Nested(
        OnMatchRuleSchema, many=True, required=False, allow_none=True
    )
    metadata = fields.Dict(
        keys=fields.Str(), required=False  # No restriction on values
    )
    ignore_errors = fields.Bool(required=False)
    onmatch_rules_lookup = fields.Dict(required=False, allow_none=True)
    tokens_by_class = fields.Dict(
        keys=fields.Str(), values=fields.List(fields.Str), required=False
    )
    graph = fields.Nested(
        DirectedGraphSchema, many=False, allow_none=True, required=False
    )
    tokenizer_pattern = fields.Str(required=False)
    graphtransliterator_version = fields.Str(required=False)
    check_ambiguity = fields.Bool(required=False)
    # field for coverage
    coverage = fields.Bool(required=False)
    # compressed_settings = fields.Tuple(required=False)

    class Meta:
        ordered = True

    @pre_load
    def check_version_and_compression(self, data, **kwargs):
        """Raise an error if the serialized GraphTransliterator is from a later
        version, and decompress any compressed settings."""
        version = data.get("graphtransliterator_version")
        if version and version > __version__:
            raise IncorrectVersionException
        compressed_settings = data.get("compressed_settings")
        if compressed_settings:
            data = decompress_config(compressed_settings)
            data.update(graphtransliterator_version=version)
        return data

    @post_load
    def make_GraphTransliterator(self, data, **kwargs):
        # Convert lists to sets
        for key in ("tokens", "tokens_by_class"):
            if data.get(key):  # tokens_by_class can be generated
                data[key] = {k: set(v) for k, v in data[key].items()}
        # Do not check ambiguity by default if deserializing
        data["check_ambiguity"] = kwargs.get("check_ambiguity", False)
        return GraphTransliterator(**data)

    @validates_schema
    def validate_onmatch_rules_lookup(self, data, **kwargs):
        """Check that if there is an onmatch_rules_lookup there are onmatch_rules."""
        if data.get("onmatch_rules_lookup") and not data.get("onmatch_rules"):
            raise ValidationError(
                "Contains onmatch_rules_lookup but not onmatch_rules."
            )
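
# Illustrative usage sketch (not part of the library source): the schema above
# drives (de)serialization. Given an existing GraphTransliterator `gt`,
# dumping and reloading through the schema round-trips it;
# `check_version_and_compression` runs at pre_load and
# `make_GraphTransliterator` at post_load.
#
#     schema = GraphTransliteratorSchema()
#     settings = schema.dump(gt)   # ordered dict of plain Python data types
#     gt2 = schema.load(settings)  # returns a new GraphTransliterator
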
class GraphTransliterator:
    """
    A graph-based transliteration tool that lets you convert the symbols
    of one language or script to those of another using rules that you define.

    Transliteration of tokens of an input string to an output string is
    configured by: a set of input token types with classes, pattern-matching
    rules involving sequences of tokens as well as preceding or following
    tokens and token classes, insertion rules between matches, and optional
    consolidation of whitespace. Rules are ordered by specificity.

    Note
    ----
    This constructor does not validate settings and should typically not be
    called directly. Use :meth:`from_dict` instead. For "easy reading"
    support, use :meth:`from_easyreading_dict`, :meth:`from_yaml`, or
    :meth:`from_yaml_file`. Keyword parameters used here (``ignore_errors``,
    ``check_ambiguity``) can be passed from those other constructors.

    Parameters
    ----------
    tokens : `dict` of {`str`: `set` of `str`}
        Mapping of input token types to token classes
    rules : `list` of `TransliterationRule`
        `list` of transliteration rules ordered by cost
    onmatch_rules : `list` of :class:`OnMatchRule`, or `None`
        Rules for output to be inserted between tokens of certain classes
        when a transliteration rule has been matched but before its
        production string has been added to the output
    whitespace : `WhitespaceRules`
        Rules for handling whitespace
    metadata : `dict` or `None`
        Metadata settings
    ignore_errors : `bool`, optional
        If true, transliteration errors are ignored and do not raise an
        exception. The default is false.
    check_ambiguity : `bool`, optional
        If true (default), transliteration rules are checked for ambiguity.
        :meth:`load` and :meth:`loads` do not check ambiguity by default.
    onmatch_rules_lookup : `dict` of {`str`: `dict` of {`str`: `list` of `int`}}, optional
        OnMatchRules lookup, used internally, will be generated if not present.
    tokens_by_class : `dict` of {`str`: `set` of `str`}, optional
        Tokens by class, used internally, will be generated if not present.
    graph : `DirectedGraph`, optional
        Directed graph used by Graph Transliterator,
        will be generated if not present.
    tokenizer_pattern : `str`, optional
        Regular expression pattern for input string tokenization,
        will be generated if not present.
    graphtransliterator_version : `str`, optional
        Version of graphtransliterator, added by :meth:`dump` and :meth:`dumps`.

    Example
    -------
    .. jupyter-execute::

        from graphtransliterator import GraphTransliterator, OnMatchRule, TransliterationRule, WhitespaceRules
        settings = {'tokens': {'a': {'vowel'}, ' ': {'wb'}},
                    'onmatch_rules': [OnMatchRule(prev_classes=['vowel'], next_classes=['vowel'], production=',')],
                    'rules': [TransliterationRule(production='A', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562),
                              TransliterationRule(production=' ', prev_classes=None, prev_tokens=None, tokens=[' '], next_tokens=None, next_classes=None, cost=0.5849625007211562)],
                    'metadata': {'author': 'Author McAuthorson'},
                    'whitespace': WhitespaceRules(default=' ', token_class='wb', consolidate=False)}
        gt = GraphTransliterator(**settings)
        gt.transliterate('a')

    See Also
    --------
    from_dict : Constructor from dictionary of settings
    from_easyreading_dict : Constructor from dictionary in "easy reading" format
    from_yaml : Constructor from YAML string in "easy reading" format
    from_yaml_file : Constructor from YAML file in "easy reading" format
    """  # noqa

    # ---------- initialize ----------
    def __init__(
        self,
        tokens,
        rules,
        whitespace,
        onmatch_rules=None,
        metadata=None,
        ignore_errors=False,
        check_ambiguity=True,
        onmatch_rules_lookup=None,
        tokens_by_class=None,
        graph=None,
        tokenizer_pattern=None,
        graphtransliterator_version=None,
        **kwargs,
    ):
        self._tokens = tokens
        self._rules = rules
        self._tokens_by_class = tokens_by_class or _tokens_by_class_of(tokens)

        self._check_ambiguity = check_ambiguity
        if check_ambiguity:
            check_for_ambiguity(self)

        self._whitespace = whitespace

        if onmatch_rules:
            self._onmatch_rules = onmatch_rules
            if onmatch_rules_lookup:
                self._onmatch_rules_lookup = onmatch_rules_lookup
            else:
                self._onmatch_rules_lookup = _onmatch_rules_lookup(
                    tokens, onmatch_rules
                )
        else:
            self._onmatch_rules = None
            self._onmatch_rules_lookup = None

        self._metadata = metadata
        self._ignore_errors = ignore_errors

        if not tokenizer_pattern:
            tokenizer_pattern = _tokenizer_pattern_from(list(tokens.keys()))
        self._tokenizer_pattern = tokenizer_pattern
        self._tokenizer = re.compile(tokenizer_pattern, re.S)

        if not graph:
            graph = _graph_from(rules)
        self._graph = graph

        self._rule_keys = []  # last matched rules

        # When, or if, necessary, add version checking here
        if not graphtransliterator_version:
            graphtransliterator_version = __version__
        self._graphtransliterator_version = graphtransliterator_version

    # ---------- class methods ----------

    # ---------- private functions ----------

    def _match_constraints(self, target_edge, curr_node, token_i, tokens):
        """
        Match edge constraints.

        Called on an edge before a rule. `token_i` is set to the location
        right after the tokens consumed.
        """
        constraints = target_edge.get("constraints")
        rules = self.rules
        if not constraints:
            return True
        for c_type, c_values in constraints.items():
            if c_type == "prev_tokens":
                num_tokens = len(rules[curr_node["rule_key"]].tokens)
                # presume for rule (a) a, with input "aa"
                # ' ', a, a, ' '  start (token_i=3)
                #            ^
                #         ^      - num_tokens
                #      ^         - len(c_values)
                start_at = token_i
                start_at -= num_tokens
                start_at -= len(c_values)
                if not self._match_tokens(
                    start_at,
                    c_values,
                    tokens,
                    check_prev=True,
                    check_next=False,
                    by_class=False,
                ):
                    return False
            elif c_type == "next_tokens":
                # presume for rule a (a), with input "aa"
                # ' ', a, a, ' '  start (token_i=2)
                #         ^
                start_at = token_i
                if not self._match_tokens(
                    start_at,
                    c_values,
                    tokens,
                    check_prev=False,
                    check_next=True,
                    by_class=False,
                ):
                    return False
            elif c_type == "prev_classes":
                num_tokens = len(rules[curr_node["rule_key"]].tokens)
                # presume for rule (a <class_a>) a, with input "aaa"
                # ' ', a, a, a, ' '
                #               ^    start (token_i=4)
                #            ^       - num_tokens
                #         ^          - len(prev_tokens)
                #      ^             - len(prev_classes)
                start_at = token_i
                start_at -= num_tokens
                prev_tokens = constraints.get("prev_tokens")
                if prev_tokens:
                    start_at -= len(prev_tokens)
                start_at -= len(c_values)
                if not self._match_tokens(
                    start_at,
                    c_values,
                    tokens,
                    check_prev=True,
                    check_next=False,
                    by_class=True,
                ):
                    return False
            elif c_type == "next_classes":
                # presume for rule a (a <class_a>), with input "aaa"
                # ' ', a, a, a, ' '
                #         ^          start (token_i=2)
                #            ^       + len(next_tokens)
                start_at = token_i
                next_tokens = constraints.get("next_tokens")
                if next_tokens:
                    start_at += len(next_tokens)
                if not self._match_tokens(
                    start_at,
                    c_values,
                    tokens,
                    check_prev=False,
                    check_next=True,
                    by_class=True,
                ):
                    return False
        return True

    def _match_tokens(
        self,
        start_i,
        constraint_values,
        tokens,
        check_prev=True,
        check_next=True,
        by_class=False,
    ):
        """Match tokens at a particular index, optionally checking previous or
        next tokens and by token class, with boundary checks."""
        if check_prev and start_i < 0:
            return False
        if check_next and start_i + len(constraint_values) > len(tokens):
            return False
        for i in range(0, len(constraint_values)):
            if by_class:
                if constraint_values[i] not in self._tokens[tokens[start_i + i]]:
                    return False
            elif tokens[start_i + i] != constraint_values[i]:
                return False
        return True

    # ---------- properties ----------

    @property
    def graph(self):
        """`DirectedGraph`: Graph used in transliteration."""
        return self._graph

    @property
    def graphtransliterator_version(self):
        """`str`: Graph Transliterator version."""
        return self._graphtransliterator_version

    @property
    def ignore_errors(self):
        """`bool`: Ignore transliteration errors setting."""
        return self._ignore_errors

    @ignore_errors.setter
    def ignore_errors(self, value):
        self._ignore_errors = value

    @property
    def last_input_tokens(self):
        """`list` of `str`: Last tokenization of the input string, with
        whitespace at start and end."""
        return self._input_tokens

    @property
    def last_matched_rule_tokens(self):
        """`list` of `list` of `str`: Last matched tokens for each rule."""
        return [self._rules[_].tokens for _ in self._rule_keys]

    @property
    def last_matched_rules(self):
        """`list` of `TransliterationRule`: Last transliteration rules matched."""
        return [self._rules[_] for _ in self._rule_keys]

    @property
    def metadata(self):
        """`dict`: Metadata of transliterator."""
        return self._metadata

    @property
    def onmatch_rules_lookup(self):
        """`dict`: On match rules lookup."""
        return self._onmatch_rules_lookup

    @property
    def tokenizer_pattern(self):
        """`str`: Tokenizer pattern from transliterator."""
        return self._tokenizer_pattern

    @property
    def tokens_by_class(self):
        """`dict` of {`str`: `set` of `str`}: Tokens of transliterator by class."""
        return self._tokens_by_class

    @property
    def onmatch_rules(self):
        """`list` of :class:`OnMatchRule`: Rules for productions between matches."""
        return self._onmatch_rules

    @property
    def productions(self):
        """`list` of `str`: List of productions of each transliteration rule."""
        return [_.production for _ in self.rules]

    @property
    def rules(self):
        """`list` of `TransliterationRule`: Transliteration rules sorted by cost."""
        return self._rules

    @property
    def tokens(self):
        """`dict` of {`str`: `set` of `str`}: Mappings of tokens to their classes."""
        return self._tokens

    @property
    def whitespace(self):
        """`WhitespaceRules`: Whitespace rules."""
        return self._whitespace

    # ---------- public functions ----------

    def dump(self, compression_level=0):
        """
        Dump configuration of Graph Transliterator to Python data types.

        Compression is turned off by default.

        Parameters
        ----------
        compression_level : `int`
            One of 0 (default, no compression), 1 (compression including the
            graph), or 2 (compression without the graph)

        Returns
        -------
        OrderedDict
            GraphTransliterator configuration as a dictionary with keys:

            ``"tokens"``
              Mappings of tokens to their classes
              (`OrderedDict` of {`str`: `list` of `str`})

            ``"rules"``
              Transliteration rules in direct format
              (`list` of `dict` of {`str`: `str`})

            ``"whitespace"``
              Whitespace settings
              (`dict` of {`str`: `str`})

            ``"onmatch_rules"``
              On match rules
              (`list` of `OrderedDict`)

            ``"metadata"``
              Dictionary of metadata (`dict`)

            ``"ignore_errors"``
              Ignore errors in transliteration (`bool`)

            ``"onmatch_rules_lookup"``
              Dictionary keyed by current token to previous token containing
              a list of indexes of applicable :class:`OnMatchRule` to try
              (`dict` of {`str`: `dict` of {`str`: `list` of `int`}})

            ``"tokens_by_class"``
              Tokens keyed by token class, used internally
              (`dict` of {`str`: `list` of `str`})

            ``"graph"``
              Serialization of `DirectedGraph` (`dict`)

            ``"tokenizer_pattern"``
              Regular expression for tokenizing (`str`)

            ``"graphtransliterator_version"``
              Module version of `graphtransliterator` (`str`)

        Example
        -------
        .. jupyter-execute::

            yaml_ = '''
              tokens:
                a: [vowel]
                ' ': [wb]
              rules:
                a: A
                ' ': ' '
              whitespace:
                default: " "
                consolidate: false
                token_class: wb
              onmatch_rules:
                - <vowel> + <vowel>: ','  # add a comma between vowels
              metadata:
                author: "Author McAuthorson"
            '''
            gt = GraphTransliterator.from_yaml(yaml_)
            gt.dump()

        See Also
        --------
        dumps : Dump Graph Transliterator configuration to JSON string
        load : Load Graph Transliterator from configuration in Python data types
        loads : Load Graph Transliterator from configuration as a JSON string
        """  # noqa
        if compression_level == 0:
            return GraphTransliteratorSchema().dump(self)
        elif compression_level not in range(1, HIGHEST_COMPRESSION_LEVEL + 1):
            raise ValueError(
                f"Compression level must be between 0 and {HIGHEST_COMPRESSION_LEVEL}"
            )
        return {
            "graphtransliterator_version": __version__,
            "compressed_settings": compress_config(
                GraphTransliteratorSchema().dump(self),
                compression_level=compression_level,
            ),
        }
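
    # Illustrative usage sketch (not part of the library source): a compressed
    # dump can be reloaded with :meth:`load`, whose schema detects the
    # ``compressed_settings`` key at pre_load and decompresses it. Assumes a
    # transliterator `gt` built as in the docstring example above.
    #
    #     compact = gt.dump(compression_level=2)   # level 2 omits the graph
    #     gt2 = GraphTransliterator.load(compact)  # graph regenerated on load
    #     assert gt2.transliterate('a a') == gt.transliterate('a a')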

    def dumps(self, compression_level=2):
        """
        Dump settings of Graph Transliterator to a JavaScript Object Notation
        (JSON) string.

        Compression is turned on by default, and the JSON output is compact
        (minimal separators).

        Parameters
        ----------
        compression_level : `int`
            One of 0 (no compression), 1 (compression including the graph),
            or 2 (default, compression without the graph)

        Returns
        -------
        `str`
            JSON string

        Examples
        --------
        .. jupyter-execute::

            yaml_ = '''
              tokens:
                a: [vowel]
                ' ': [wb]
              rules:
                a: A
                ' ': ' '
              whitespace:
                default: " "
                consolidate: false
                token_class: wb
              onmatch_rules:
                - <vowel> + <vowel>: ','  # add a comma between vowels
              metadata:
                author: "Author McAuthorson"
            '''
            gt = GraphTransliterator.from_yaml(yaml_)
            gt.dumps()

        See Also
        --------
        dump : Dump Graph Transliterator configuration to Python data types
        load : Load Graph Transliterator from configuration in Python data types
        loads : Load Graph Transliterator from configuration as a JSON string
        """  # noqa
        if compression_level == 0:
            return GraphTransliteratorSchema().dumps(self)
        _config = self.dump(compression_level=compression_level)
        return json.dumps(_config, separators=(",", ":"))

    def match_at(self, token_i, tokens, match_all=False):
        """
        Match the best (least costly) transliteration rule at a given index
        in the input tokens, and return the index of that rule. Optionally,
        return all rules that match.

        Parameters
        ----------
        token_i : `int`
            Location in `tokens` at which to begin
        tokens : `list` of `str`
            List of tokens
        match_all : `bool`, optional
            If true, return the indexes of all rules matching at the given
            index. The default is false.

        Returns
        -------
        `int`, `None`, or `list` of `int`
            Index of matching transliteration rule in
            :attr:`GraphTransliterator.rules` or `None`. Returns a `list` of
            `int`, possibly empty, if ``match_all`` is true.

        Note
        ----
        Expects whitespace tokens at the beginning and end of `tokens`.

        Examples
        --------
        .. jupyter-execute::

            gt = GraphTransliterator.from_yaml('''
                    tokens:
                        a: []
                        a a: []
                        ' ': [wb]
                    rules:
                        a: <A>
                        a a: <AA>
                    whitespace:
                        default: ' '
                        consolidate: True
                        token_class: wb
            ''')
            tokens = gt.tokenize("aa")
            tokens # whitespace added to ends

        .. jupyter-execute::

            gt.match_at(1, tokens) # returns index to rule

        .. jupyter-execute::

            gt.rules[gt.match_at(1, tokens)] # actual rule

        .. jupyter-execute::

            gt.match_at(1, tokens, match_all=True) # index to rules, with match_all

        .. jupyter-execute::

            [gt.rules[_] for _ in gt.match_at(1, tokens, match_all=True)]
        """  # noqa
        graph = self._graph
        graph_node = graph.node
        graph_edge = graph.edge
        if match_all:
            matches = []
        stack = deque()

        def _append_children(node_key, token_i):
            children = None
            ordered_children = graph_node[node_key].get("ordered_children")
            if ordered_children:
                children = ordered_children.get(tokens[token_i])
                if children:
                    # reordered high to low for stack:
                    for child_key in reversed(children):
                        stack.appendleft((child_key, node_key, token_i))
                else:
                    rules_keys = ordered_children.get("__rules__")  # leafs
                    if rules_keys:
                        # There may be more than one rule, as certain rules
                        # have constraints on them.
                        # Reordered so higher cost go on stack last.
                        for rule_key in reversed(rules_keys):
                            stack.appendleft((rule_key, node_key, token_i))

        _append_children(0, token_i)  # Append all children of root node
        while stack:  # LIFO
            node_key, parent_key, token_i = stack.popleft()
            curr_node = graph_node[node_key]
            # Constraints are only on the preceding edge if it is accepting,
            # but the edge is accessed regardless to test coverage
            incident_edge = graph_edge[parent_key][node_key]
            # Pass edge, curr_node, token index, and tokens to check constraints
            if curr_node.get("accepting") and self._match_constraints(
                incident_edge, curr_node, token_i, tokens
            ):
                if match_all:
                    matches.append(curr_node["rule_key"])
                    continue
                else:
                    return curr_node["rule_key"]
            else:
                if token_i < len(tokens) - 1:
                    token_i += 1
                _append_children(node_key, token_i)
        if match_all:
            return matches

    def pruned_of(self, productions):
        """
        Remove transliteration rules with specific output productions.

        Parameters
        ----------
        productions : `str`, or `list` of `str`
            Production or list of productions to remove

        Returns
        -------
        graphtransliterator.GraphTransliterator
            Graph transliterator pruned of certain productions

        Note
        ----
        Uses original initialization parameters to construct a new
        :class:`GraphTransliterator`.

        Examples
        --------
        .. jupyter-execute::

            gt = GraphTransliterator.from_yaml('''
                    tokens:
                        a: []
                        a a: []
                        ' ': [wb]
                    rules:
                        a: <A>
                        a a: <AA>
                    whitespace:
                        default: ' '
                        consolidate: True
                        token_class: wb
            ''')
            gt.rules

        .. jupyter-execute::

            gt.pruned_of('<AA>').rules

        .. jupyter-execute::

            gt.pruned_of(['<A>', '<AA>']).rules
        """  # noqa
        pruned_rules = [_ for _ in self._rules if _.production not in productions]
        return GraphTransliterator(
            self._tokens,
            pruned_rules,
            self._whitespace,
            onmatch_rules=self._onmatch_rules,
            metadata=self._metadata,
            ignore_errors=self._ignore_errors,
            check_ambiguity=self._check_ambiguity,
        )

    def tokenize(self, input):
        """
        Tokenize an input string.

        Adds initial and trailing whitespace, which can be consolidated.

        Parameters
        ----------
        input : `str`
            String to tokenize

        Returns
        -------
        `list` of `str`
            List of tokens, with default whitespace token at beginning and end.

        Raises
        ------
        ValueError
            Unrecognizable input, such as a character that is not in a token

        Examples
        --------
        .. jupyter-execute::

            tokens = {'ab': ['class_ab'], ' ': ['wb']}
            whitespace = {'default': ' ', 'token_class': 'wb', 'consolidate': True}
            rules = {'ab': 'AB', ' ': '_'}
            settings = {'tokens': tokens, 'rules': rules, 'whitespace': whitespace}
            gt = GraphTransliterator.from_easyreading_dict(settings)
            gt.tokenize('ab ')
        """

        def is_whitespace(token):
            """Check if token is whitespace."""
            return self.whitespace.token_class in self.tokens[token]

        # start with a whitespace token
        tokens = [self.whitespace.default]

        prev_whitespace = True

        match_at = 0
        while match_at < len(input):
            match = self._tokenizer.match(input, match_at)
            if match:
                match_at = match.end()  # advance match_at
                token = match.group(0)
                # Could save match location here:
                # matched_at = match.span(0)[0]
                if is_whitespace(token):
                    if prev_whitespace and self.whitespace.consolidate:
                        continue
                    else:
                        prev_whitespace = True
                else:
                    prev_whitespace = False
                tokens.append(token)
            else:
                logger.warning(
                    "Unrecognizable token %s at pos %s of %s"
                    % (input[match_at], match_at, input)
                )
                if not self.ignore_errors:
                    raise UnrecognizableInputTokenException
                else:
                    match_at += 1

        if self.whitespace.consolidate:
            while len(tokens) > 1 and is_whitespace(tokens[-1]):
                tokens.pop()

        tokens.append(self.whitespace.default)

        return tokens

    def transliterate(self, input):
        """
        Transliterate an input string into an output string.

        Parameters
        ----------
        input : `str`
            Input string to transliterate

        Returns
        -------
        `str`
            Transliteration output string

        Raises
        ------
        ValueError
            Cannot parse input

        Note
        ----
        Whitespace will be temporarily appended to start and end of input
        string.

        Example
        -------
        .. jupyter-execute::

            GraphTransliterator.from_yaml(
            '''
            tokens:
              a: []
              ' ': [wb]
            rules:
              a: A
              ' ': '_'
            whitespace:
              default: ' '
              consolidate: True
              token_class: wb
            ''').transliterate("a a")
        """
        tokens = self.tokenize(input)  # Adds initial+final whitespace
        self._input_tokens = tokens  # Tokens are saved here
        self._rule_keys = []  # Matched rule keys are saved here
        output = ""
        token_i = 1  # Adjust for initial whitespace

        while token_i < len(tokens) - 1:  # Adjust for final whitespace
            rule_key = self.match_at(token_i, tokens)
            if rule_key is None:
                logger.warning(
                    "No matching transliteration rule at token pos %s of %s"
                    % (token_i, tokens)
                )
                # No parsing rule was found at this location
                if self.ignore_errors:
                    # Move along if ignoring errors
                    token_i += 1
                    continue
                else:
                    raise NoMatchingTransliterationRuleException
            self._rule_keys.append(rule_key)
            rule = self.rules[rule_key]
            tokens_matched = rule.tokens
            if self._onmatch_rules:
                curr_match_rules = None
                prev_t = tokens[token_i - 1]
                curr_t = tokens[token_i]
                curr_t_rules = self._onmatch_rules_lookup.get(curr_t)
                if curr_t_rules:
                    curr_match_rules = curr_t_rules.get(prev_t)
                if curr_match_rules:
                    for onmatch_i in curr_match_rules:
                        onmatch = self._onmatch_rules[onmatch_i]
                        # <class_a> <class_a> + <class_b>
                        # a a b
                        #     ^
                        # ^      - len(onmatch.prev_classes)
                        if self._match_tokens(
                            token_i - len(onmatch.prev_classes),
                            onmatch.prev_classes,  # Checks last value
                            tokens,
                            check_prev=True,
                            check_next=False,
                            by_class=True,
                        ) and self._match_tokens(
                            token_i,
                            onmatch.next_classes,  # Checks first value
                            tokens,
                            check_prev=False,
                            check_next=True,
                            by_class=True,
                        ):
                            output += onmatch.production
                            break  # Only match best onmatch rule
            output += rule.production
            token_i += len(tokens_matched)

        return output
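
    # Illustrative usage sketch (not part of the library source): the onmatch
    # branch above inserts a production between two matched rules when the
    # classes of the surrounding tokens match. For example, a comma is
    # inserted between two vowels:
    #
    #     gt = GraphTransliterator.from_yaml('''
    #         tokens:
    #           a: [vowel]
    #           ' ': [wb]
    #         rules:
    #           a: A
    #           ' ': ' '
    #         whitespace:
    #           default: ' '
    #           consolidate: false
    #           token_class: wb
    #         onmatch_rules:
    #           - <vowel> + <vowel>: ','
    #     ''')
    #     gt.transliterate('aa')   # 'A,A'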
# ---------- static methods ----------

    @staticmethod
    def from_dict(dict_settings, **kwargs):
        """Generate GraphTransliterator from `dict` settings.

        Parameters
        ----------
        dict_settings : `dict`
            Dictionary of settings

        Returns
        -------
        GraphTransliterator
            Graph transliterator
        """
        settings = SettingsSchema().load(dict_settings)
        args = [settings["tokens"], settings["rules"], settings["whitespace"]]
        kwargs = {
            "onmatch_rules": settings.get("onmatch_rules"),
            "metadata": settings.get("metadata"),
            "tokens_by_class": settings.get("tokens_by_class"),  # will be generated
            "graph": settings.get("graph"),  # will be generated
            "tokenizer_pattern": settings.get("tokenizer_pattern"),  # will be generated
            "ignore_errors": kwargs.get("ignore_errors", False),
            "check_ambiguity": kwargs.get("check_ambiguity", True),
        }
        return GraphTransliterator(*args, **kwargs)
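
    # Illustrative usage sketch (not part of the library source): `from_dict`
    # expects settings already in the direct format validated by
    # `SettingsSchema`. One way to produce such settings is the same path
    # taken by `from_easyreading_dict` below:
    #
    #     easyreading = {'tokens': {'a': ['vowel'], ' ': ['wb']},
    #                    'rules': {'a': 'A', ' ': '_'},
    #                    'whitespace': {'default': ' ', 'token_class': 'wb',
    #                                   'consolidate': True}}
    #     direct = _process_easyreading_settings(
    #         EasyReadingSettingsSchema().load(easyreading))
    #     gt = GraphTransliterator.from_dict(direct)
    #     gt.transliterate('a')   # 'A'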

    @staticmethod
    def from_easyreading_dict(easyreading_settings, **kwargs):
        """
        Construct :class:`GraphTransliterator` from a dictionary of settings
        in "easy reading" format, i.e. the loaded contents of a YAML string.

        Parameters
        ----------
        easyreading_settings : `dict`
            Settings dictionary in easy reading format with keys:

            ``"tokens"``
              Mappings of tokens to their classes
              (`dict` of {`str`: `list` of `str`})

            ``"rules"``
              Transliteration rules in "easy reading" format
              (`list` of `dict` of {`str`: `str`})

            ``"onmatch_rules"``
              On match rules in "easy reading" format
              (`list` of `dict` of {`str`: `str`}, optional)

            ``"whitespace"``
              Whitespace definitions, including default whitespace token,
              class of whitespace tokens, and whether or not to consolidate
              (`dict` of {'default': `str`, 'token_class': `str`,
              'consolidate': `bool`}, optional)

            ``"metadata"``
              Dictionary of metadata (`dict`, optional)

        Returns
        -------
        GraphTransliterator
            Graph Transliterator

        Note
        ----
        Called by :meth:`from_yaml`.

        Example
        -------
        .. jupyter-execute::

            tokens = {
                'ab': ['class_ab'],
                ' ': ['wb']
            }
            whitespace = {
                'default': ' ',
                'token_class': 'wb',
                'consolidate': True
            }
            onmatch_rules = [
                {'<class_ab> + <class_ab>': ','}
            ]
            rules = {'ab': 'AB', ' ': '_'}
            settings = {'tokens': tokens,
                        'rules': rules,
                        'whitespace': whitespace,
                        'onmatch_rules': onmatch_rules}
            gt = GraphTransliterator.from_easyreading_dict(settings)
            gt.transliterate("ab abab")

        See Also
        --------
        from_yaml : Constructor from YAML string in "easy reading" format
        from_yaml_file : Constructor from YAML file in "easy reading" format
        """
        # Validate easy reading settings
        _ = EasyReadingSettingsSchema().load(easyreading_settings)
        # Convert those to regular settings
        _ = _process_easyreading_settings(_)
        # Validation of regular settings is done in from_dict
        return GraphTransliterator.from_dict(_, **kwargs)

    @staticmethod
    def from_yaml(yaml_str, charnames_escaped=True, **kwargs):
        """
        Construct GraphTransliterator from a YAML string.

        Parameters
        ----------
        yaml_str : `str`
            YAML mappings of tokens, rules, and (optionally) onmatch_rules
        charnames_escaped : `bool`
            Unescape Unicode during YAML read (default True)

        Note
        ----
        Called by :meth:`from_yaml_file` and calls
        :meth:`from_easyreading_dict`.

        Example
        -------
        .. jupyter-execute::

            yaml_ = '''
            tokens:
              a: [class1]
              ' ': [wb]
            rules:
              a: A
              ' ': ' '
            whitespace:
              default: ' '
              consolidate: True
              token_class: wb
            onmatch_rules:
              - <class1> + <class1>: "+"
            '''
            gt = GraphTransliterator.from_yaml(yaml_)
            gt.transliterate("a aa")

        See Also
        --------
        from_easyreading_dict : Constructor from dictionary in "easy reading" format
        from_yaml_file : Constructor from YAML file in "easy reading" format
        """
        if charnames_escaped:
            yaml_str = _unescape_charnames(yaml_str)
        settings = yaml.safe_load(yaml_str)
        return GraphTransliterator.from_easyreading_dict(settings, **kwargs)

    @staticmethod
    def from_yaml_file(yaml_filename, **kwargs):
        """
        Construct GraphTransliterator from a YAML file.

        Parameters
        ----------
        yaml_filename : `str`
            Name of YAML file, containing tokens, rules, and (optionally)
            onmatch_rules

        Note
        ----
        Calls :meth:`from_yaml`.

        See Also
        --------
        from_yaml : Constructor from YAML string in "easy reading" format
        from_easyreading_dict : Constructor from dictionary in "easy reading" format
        """
        with open(yaml_filename, "r") as f:
            yaml_string = f.read()
        return GraphTransliterator.from_yaml(yaml_string, **kwargs)
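
    # Illustrative usage sketch (not part of the library source): reading the
    # same "easy reading" settings from a file. The filename is hypothetical.
    #
    #     gt = GraphTransliterator.from_yaml_file('transliterator.yaml')
    #     gt.transliterate('a')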

    @staticmethod
    def load(settings, **kwargs):
        """Create GraphTransliterator from settings as Python data types.

        Parameters
        ----------
        settings
            GraphTransliterator configuration as a dictionary with keys:

            ``"tokens"``
              Mappings of tokens to their classes
              (`dict` of {`str`: `list` of `str`})

            ``"rules"``
              Transliteration rules in direct format
              (`list` of `OrderedDict` of {`str`: `str`})

            ``"whitespace"``
              Whitespace settings
              (`dict` of {`str`: `str`})

            ``"onmatch_rules"``
              On match rules (`list` of `OrderedDict`, optional)

            ``"metadata"``
              Dictionary of metadata (`dict`, optional)

            ``"ignore_errors"``
              Ignore errors (`bool`, optional)

            ``"onmatch_rules_lookup"``
              Dictionary keyed by current token to previous token containing
              a list of indexes of applicable :class:`OnMatchRule` to try
              (`dict` of {`str`: `dict` of {`str`: `list` of `int`}}, optional)

            ``"tokens_by_class"``
              Tokens keyed by token class, used internally
              (`dict` of {`str`: `list` of `str`}, optional)

            ``"graph"``
              Serialization of `DirectedGraph` (`dict`, optional)

            ``"tokenizer_pattern"``
              Regular expression for tokenizing (`str`, optional)

            ``"graphtransliterator_version"``
              Module version of `graphtransliterator` (`str`, optional)

        Returns
        -------
        GraphTransliterator
            Graph Transliterator

        Example
        -------
        .. jupyter-execute::

            from collections import OrderedDict
            settings = \\
            {'tokens': {'a': ['vowel'], ' ': ['wb']},
             'rules': [OrderedDict([('production', 'A'),
                           # Can be compacted, removing None values
                           # ('prev_tokens', None),
                           ('tokens', ['a']),
                           ('next_classes', None),
                           ('next_tokens', None),
                           ('cost', 0.5849625007211562)]),
                       OrderedDict([('production', ' '),
                           ('prev_classes', None),
                           ('prev_tokens', None),
                           ('tokens', [' ']),
                           ('next_classes', None),
                           ('next_tokens', None),
                           ('cost', 0.5849625007211562)])],
             'whitespace': {'default': ' ', 'token_class': 'wb', 'consolidate': False},
             'onmatch_rules': [OrderedDict([('prev_classes', ['vowel']),
                                            ('next_classes', ['vowel']),
                                            ('production', ',')])],
             'metadata': {'author': 'Author McAuthorson'},
             'onmatch_rules_lookup': {'a': {'a': [0]}},
             'tokens_by_class': {'vowel': ['a'], 'wb': [' ']},
             'graph': {'edge': {0: {1: {'token': 'a', 'cost': 0.5849625007211562},
                                    3: {'token': ' ', 'cost': 0.5849625007211562}},
                                1: {2: {'cost': 0.5849625007211562}},
                                3: {4: {'cost': 0.5849625007211562}}},
                       'node': [{'type': 'Start', 'ordered_children': {'a': [1], ' ': [3]}},
                                {'type': 'token', 'token': 'a', 'ordered_children': {'__rules__': [2]}},
                                {'type': 'rule', 'rule_key': 0, 'accepting': True, 'ordered_children': {}},
                                {'type': 'token', 'token': ' ', 'ordered_children': {'__rules__': [4]}},
                                {'type': 'rule', 'rule_key': 1, 'accepting': True, 'ordered_children': {}}],
                       'edge_list': [(0, 1), (1, 2), (0, 3), (3, 4)]},
             'tokenizer_pattern': '(a|\\ )',
             'graphtransliterator_version': '0.3.3'}
            gt = GraphTransliterator.load(settings)
            gt.transliterate('aa')

        .. jupyter-execute::

            # can be compacted
            settings.pop('onmatch_rules_lookup')
            GraphTransliterator.load(settings).transliterate('aa')

        See Also
        --------
        dump : Dump Graph Transliterator configuration to Python data types
        dumps : Dump Graph Transliterator configuration to JSON string
        loads : Load Graph Transliterator from configuration as a JSON string
        """  # noqa
        # combine kwargs with settings
        return GraphTransliteratorSchema().load(dict(settings, **kwargs))

    @staticmethod
    def loads(settings, **kwargs):
        """Create GraphTransliterator from a JavaScript Object Notation (JSON)
        string.

        Parameters
        ----------
        settings
            JSON settings for GraphTransliterator

        Returns
        -------
        GraphTransliterator
            Graph Transliterator

        Example
        -------
        .. jupyter-execute::

            JSON_settings = '''{
              "tokens": {"a": ["vowel"], " ": ["wb"]},
              "rules": [
                {"production": "A", "prev_classes": null, "prev_tokens": null,
                 "tokens": ["a"], "next_classes": null, "next_tokens": null,
                 "cost": 0.5849625007211562},
                {"production": " ", "prev_classes": null, "prev_tokens": null,
                 "tokens": [" "], "next_classes": null, "next_tokens": null,
                 "cost": 0.5849625007211562}],
              "whitespace": {"default": " ", "token_class": "wb",
                             "consolidate": false},
              "onmatch_rules": [{"prev_classes": ["vowel"],
                                 "next_classes": ["vowel"],
                                 "production": ","}],
              "metadata": {"author": "Author McAuthorson"},
              "ignore_errors": false,
              "onmatch_rules_lookup": {"a": {"a": [0]}},
              "tokens_by_class": {"vowel": ["a"], "wb": [" "]},
              "graph": {
                "node": [
                  {"type": "Start", "ordered_children": {"a": [1], " ": [3]}},
                  {"type": "token", "token": "a",
                   "ordered_children": {"__rules__": [2]}},
                  {"type": "rule", "rule_key": 0, "accepting": true,
                   "ordered_children": {}},
                  {"type": "token", "token": " ",
                   "ordered_children": {"__rules__": [4]}},
                  {"type": "rule", "rule_key": 1, "accepting": true,
                   "ordered_children": {}}],
                "edge": {
                  "0": {"1": {"token": "a", "cost": 0.5849625007211562},
                        "3": {"token": " ", "cost": 0.5849625007211562}},
                  "1": {"2": {"cost": 0.5849625007211562}},
                  "3": {"4": {"cost": 0.5849625007211562}}},
                "edge_list": [[0, 1], [1, 2], [0, 3], [3, 4]]},
              "tokenizer_pattern": "(a| )",
              "graphtransliterator_version": "1.2.2"}'''
            gt = GraphTransliterator.loads(JSON_settings)
            gt.transliterate('a')

        See Also
        --------
        dump : Dump Graph Transliterator configuration to Python data types
        dumps : Dump Graph Transliterator configuration to JSON string
        load : Load Graph Transliterator from configuration in Python data types
        """  # noqa
        # combine kwargs with settings
        _settings = dict(json.loads(settings), **kwargs)
        return GraphTransliteratorSchema().load(_settings)

class CoverageTransliterator(GraphTransliterator):
    """Subclass of GraphTransliterator that logs visits to the graph and
    onmatch rules.

    Used to confirm that tests cover the entire graph and onmatch_rules."""

    def __init__(self, *args, **kwargs):
        # Initialize from GraphTransliterator
        GraphTransliterator.__init__(self, *args, **kwargs)
        # Convert _graph and _onmatch_rules to visit-tracking objects
        self._graph = VisitLoggingDirectedGraph(self._graph)
        self._onmatch_rules = VisitLoggingList(self._onmatch_rules)

    def clear_visited(self):
        """Clear visited flags from graph and onmatch_rules."""
        self._graph.clear_visited()
        if self._onmatch_rules:
            self._onmatch_rules.clear_visited()

    def check_onmatchrules_coverage(self, raise_exception=True):
        """Check coverage of onmatch rules."""
        errors = []
        onmatch_rules = self._onmatch_rules
        # iterate over .data to avoid marking rules as visited
        for i, onmatch_rule in enumerate(onmatch_rules.data):
            if i not in onmatch_rules.visited:
                logger.warning(
                    "On Match Rule {} [{}] has not been visited.".format(
                        i, onmatch_rule
                    )
                )
                errors.append(i)
        if errors and raise_exception:
            error_msg = "Missed OnMatchRules: " + ",".join([str(i) for i in errors])
            raise IncompleteOnMatchRulesCoverageException(error_msg)
        return not errors

    def check_coverage(self, raise_exception=True):
        """Check coverage of graph and onmatch rules.

        First checks graph coverage, then checks onmatch rules."""
        return self._graph.check_coverage(
            raise_exception=raise_exception
        ) and self.check_onmatchrules_coverage(raise_exception=raise_exception)
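
# Illustrative usage sketch (not part of the library source): checking test
# coverage of a transliterator's graph and onmatch rules. Assumes an existing
# GraphTransliterator `gt`; its settings are reused to build the visit-logging
# subclass.
#
#     covt = CoverageTransliterator(
#         gt.tokens, gt.rules, gt.whitespace,
#         onmatch_rules=gt.onmatch_rules, check_ambiguity=False,
#     )
#     covt.transliterate('a')                     # visits are logged
#     covt.check_coverage(raise_exception=False)  # False if anything missed
#     covt.clear_visited()                        # reset between test runs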