# -*- coding: utf-8 -*-
"""
GraphTransliterator core classes.
"""
from .ambiguity import check_for_ambiguity
from .compression import compress_config, decompress_config
from .exceptions import (
IncompleteOnMatchRulesCoverageException,
IncorrectVersionException,
NoMatchingTransliterationRuleException,
UnrecognizableInputTokenException,
)
from .graphs import VisitLoggingDirectedGraph, VisitLoggingList
from .initialize import (
_graph_from,
_onmatch_rules_lookup,
_tokenizer_pattern_from,
_tokens_by_class_of,
_unescape_charnames,
)
from .process import _process_easyreading_settings
from .schemas import (
DirectedGraphSchema,
EasyReadingSettingsSchema,
OnMatchRuleSchema,
SettingsSchema,
TransliterationRuleSchema,
WhitespaceSettingsSchema,
)
from collections import deque
from graphtransliterator import __version__ as __version__
import json
import logging
from marshmallow import (
fields,
pre_load,
post_load,
Schema,
validates_schema,
ValidationError,
)
import re
import yaml
logger = logging.getLogger("graphtransliterator")
DEFAULT_COMPRESSION_LEVEL = 2
HIGHEST_COMPRESSION_LEVEL = 2
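# Compression levels: 0 = none, 1 = compression including the graph,
# 2 = compression without the graph (see dump() and dumps()).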
class GraphTransliteratorSchema(Schema):
"""Schema for Graph Transliterator."""
tokens = fields.Dict(
keys=fields.Str(), values=fields.List(fields.Str()), required=True
)
rules = fields.Nested(TransliterationRuleSchema, many=True, required=True)
whitespace = fields.Nested(WhitespaceSettingsSchema, many=False, required=True)
onmatch_rules = fields.Nested(
OnMatchRuleSchema, many=True, required=False, allow_none=True
)
metadata = fields.Dict(
keys=fields.Str(), required=False # No restriction on values
)
ignore_errors = fields.Bool(required=False)
onmatch_rules_lookup = fields.Dict(required=False, allow_none=True)
tokens_by_class = fields.Dict(
keys=fields.Str(), values=fields.List(fields.Str()), required=False
)
graph = fields.Nested(
DirectedGraphSchema, many=False, allow_none=True, required=False
)
tokenizer_pattern = fields.Str(required=False)
graphtransliterator_version = fields.Str(required=False)
check_ambiguity = fields.Bool(required=False)
# field for coverage
coverage = fields.Bool(required=False)
# compressed_settings = fields.Tuple(required=False)
class Meta:
ordered = True
@pre_load
def check_version_and_compression(self, data, **kwargs):
"""Raise an error if the serialized GraphTransliterator is from a later version; decompress any compressed settings."""
version = data.get("graphtransliterator_version")
if version and version > __version__:
raise IncorrectVersionException
compressed_settings = data.get("compressed_settings")
if compressed_settings:
data = decompress_config(compressed_settings)
data.update(graphtransliterator_version=version)
return data
@post_load
def make_GraphTransliterator(self, data, **kwargs):
# Convert lists to sets
for key in ("tokens", "tokens_by_class"):
if data.get(key): # tokens_by_class can be generated
data[key] = {k: set(v) for k, v in data[key].items()}
# Do not check ambiguity by default if deserializing
data["check_ambiguity"] = kwargs.get("check_ambiguity", False)
return GraphTransliterator(**data)
@validates_schema
def validate_onmatch_rules_lookup(self, data, **kwargs):
"""Check that if there are onmatch_rules_lookup there are onmatch_rules."""
if data.get("onmatch_rules_lookup") and not data.get("onmatch_rules"):
raise ValidationError(
"Contains onmatch_rules_lookup but not onmatch_rules."
)
class GraphTransliterator:
"""
A graph-based transliteration tool that lets you convert the symbols
of one language or script to those of another using rules that you define.
Transliteration of tokens of an input string to an output string is
configured by: a set of input token types with classes, pattern-matching rules
involving sequences of tokens as well as preceding or following tokens and
token classes, insertion rules between matches, and optional consolidation
of whitespace. Rules are ordered by specificity.
Note
----
This constructor does not validate settings and should typically not be called
directly. Use :meth:`from_dict` instead. For "easy reading" support, use
:meth:`from_easyreading_dict`, :meth:`from_yaml`, or :meth:`from_yaml_file`.
Keyword parameters used here (``ignore_errors``, ``check_ambiguity``) can be passed
from those other constructors.
Parameters
----------
tokens : `dict` of {`str`: `set` of `str`}
Mapping of input token types to token classes
rules : `list` of `TransliterationRule`
`list` of transliteration rules ordered by cost
onmatch_rules : `list` of :class:`OnMatchRule`, or `None`
Rules for output to be inserted between tokens
of certain classes when a transliteration rule has been matched
but before its production string has been added to the output
whitespace: `WhitespaceRules`
Rules for handling whitespace
metadata: `dict` or `None`
Metadata settings
ignore_errors: `bool`, optional
If true, transliteration errors are ignored and do not raise an
exception. The default is false.
check_ambiguity: `bool`, optional
If true (default), transliteration rules are checked for ambiguity. :meth:`load`
and :meth:`loads` do not check ambiguity by default.
onmatch_rules_lookup: `dict` of {`str`: `dict` of {`str`: `list` of `int`}}, optional
`OnMatchRule` lookup, used internally, will be generated if not present.
tokens_by_class: `dict` of {`str`: `set` of `str`}, optional
Tokens by class, used internally, will be generated if not present.
graph: `DirectedGraph`, optional
Directed graph used by Graph Transliterator, will be generated if not present.
tokenizer_pattern: `str`, optional
Regular expression pattern for input string tokenization, will be generated if
not present.
graphtransliterator_version: `str`, optional
Version of graphtransliterator, added by `dump()` and `dumps()`.
Example
-------
.. jupyter-execute::
from graphtransliterator import GraphTransliterator, OnMatchRule, TransliterationRule, WhitespaceRules
settings = {
'tokens': {'a': {'vowel'}, ' ': {'wb'}},
'onmatch_rules': [OnMatchRule(prev_classes=['vowel'], next_classes=['vowel'], production=',')],
'rules': [TransliterationRule(production='A', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562),
TransliterationRule(production=' ', prev_classes=None, prev_tokens=None, tokens=[' '], next_tokens=None, next_classes=None, cost=0.5849625007211562)],
'metadata': {'author': 'Author McAuthorson'},
'whitespace': WhitespaceRules(default=' ', token_class='wb', consolidate=False)}
gt = GraphTransliterator(**settings)
gt.transliterate('a')
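The on-match rule above inserts a comma between adjacent vowels:
.. jupyter-execute::
gt.transliterate('aa')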
See Also
--------
from_dict : Constructor from dictionary of settings
from_easyreading_dict : Constructor from dictionary in "easy reading" format
from_yaml : Constructor from YAML string in "easy reading" format
from_yaml_file : Constructor from YAML file in "easy reading" format""" # noqa
# ---------- initialize ----------
def __init__(
self,
tokens,
rules,
whitespace,
onmatch_rules=None,
metadata=None,
ignore_errors=False,
check_ambiguity=True,
onmatch_rules_lookup=None,
tokens_by_class=None,
graph=None,
tokenizer_pattern=None,
graphtransliterator_version=None,
**kwargs,
):
self._tokens = tokens
self._rules = rules
self._tokens_by_class = tokens_by_class or _tokens_by_class_of(tokens)
self._check_ambiguity = check_ambiguity
if check_ambiguity:
check_for_ambiguity(self)
self._whitespace = whitespace
if onmatch_rules:
self._onmatch_rules = onmatch_rules
if onmatch_rules_lookup:
self._onmatch_rules_lookup = onmatch_rules_lookup
else:
self._onmatch_rules_lookup = _onmatch_rules_lookup(
tokens, onmatch_rules
)
else:
self._onmatch_rules = None
self._onmatch_rules_lookup = None
self._metadata = metadata
self._ignore_errors = ignore_errors
if not tokenizer_pattern:
tokenizer_pattern = _tokenizer_pattern_from(list(tokens.keys()))
self._tokenizer_pattern = tokenizer_pattern
self._tokenizer = re.compile(tokenizer_pattern, re.S)
if not graph:
graph = _graph_from(rules)
self._graph = graph
self._rule_keys = [] # last matched rules
# When, or if, necessary, add version checking here
if not graphtransliterator_version:
graphtransliterator_version = __version__
self._graphtransliterator_version = graphtransliterator_version
# ---------- class methods ----------
# ---------- private functions ----------
def _match_constraints(self, target_edge, curr_node, token_i, tokens):
"""
Match edge constraints.
Called on an edge before a rule node. `token_i` is set to the location
right after the tokens consumed by the rule.
"""
constraints = target_edge.get("constraints")
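# An edge's constraints dict, when present, has the shape (all keys
# optional):
#   {"prev_classes": [...], "prev_tokens": [...],
#    "next_tokens": [...], "next_classes": [...]}
# Every constraint present must match for the rule to apply.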
rules = self.rules
if not constraints:
return True
for c_type, c_values in constraints.items():
if c_type == "prev_tokens":
num_tokens = len(rules[curr_node["rule_key"]].tokens)
# Presume rule (a) a with input "aa": the padded tokens are
# ' ', a, a, ' ' and token_i = 3 (just past the matched tokens).
# Step back num_tokens to the start of the match, then back
# len(c_values) to where the previous tokens must begin.
start_at = token_i
start_at -= num_tokens
start_at -= len(c_values)
if not self._match_tokens(
start_at,
c_values,
tokens,
check_prev=True,
check_next=False,
by_class=False,
):
return False
elif c_type == "next_tokens":
# Presume rule a (a) with input "aa": the padded tokens are
# ' ', a, a, ' ' and token_i = 2, which is already where the
# next tokens must begin, so no adjustment is needed.
start_at = token_i
if not self._match_tokens(
start_at,
c_values,
tokens,
check_prev=False,
check_next=True,
by_class=False,
):
return False
elif c_type == "prev_classes":
num_tokens = len(rules[curr_node["rule_key"]].tokens)
# Presume rule (a <class_a>) a with input "aaa": the padded tokens
# are ' ', a, a, a, ' ' and token_i = 4. Step back num_tokens, then
# len(prev_tokens) if present, then len(c_values) to reach where the
# previous classes must begin.
start_at = token_i
start_at -= num_tokens
prev_tokens = constraints.get("prev_tokens")
if prev_tokens:
start_at -= len(prev_tokens)
start_at -= len(c_values)
if not self._match_tokens(
start_at,
c_values,
tokens,
check_prev=True,
check_next=False,
by_class=True,
):
return False
elif c_type == "next_classes":
# Presume rule a (a <class_a>) with input "aaa": the padded tokens
# are ' ', a, a, a, ' ' and token_i = 2. Skip forward past any
# next_tokens to reach where the next classes must begin.
start_at = token_i
next_tokens = constraints.get("next_tokens")
if next_tokens:
start_at += len(next_tokens)
if not self._match_tokens(
start_at,
c_values,
tokens,
check_prev=False,
check_next=True,
by_class=True,
):
return False
return True
def _match_tokens(
self,
start_i,
constraint_values,
tokens,
check_prev=True,
check_next=True,
by_class=False,
):
"""
Match tokens at a particular index, optionally checking previous or next tokens
and by token class, with boundary checks."""
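# Example: with tokens [' ', 'a', 'a', ' '] and constraint_values
# ['a'], start_i=1 and by_class=False compares tokens[1] == 'a' and
# returns True; a negative start_i with check_prev fails the boundary
# check immediately.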
if check_prev and start_i < 0:
return False
if check_next and start_i + len(constraint_values) > len(tokens):
return False
for i in range(len(constraint_values)):
if by_class:
if constraint_values[i] not in self._tokens[tokens[start_i + i]]:
return False
elif tokens[start_i + i] != constraint_values[i]:
return False
return True
# ---------- properties ----------
@property
def graph(self):
"""`DirectedGraph`: Graph used in transliteration."""
return self._graph
@property
def graphtransliterator_version(self):
"""`str`: Graph Transliterator version."""
return self._graphtransliterator_version
@property
def ignore_errors(self):
"""`bool`: Ignore transliteration errors setting."""
return self._ignore_errors
@ignore_errors.setter
def ignore_errors(self, value):
self._ignore_errors = value
@property
def last_input_tokens(self):
"""
`list` of `str`: Last tokenization of the input string, with whitespace
at start and end."""
return self._input_tokens
@property
def last_matched_rule_tokens(self):
"""`list` of `list` of `str`: Last matched tokens for each rule."""
return [self._rules[_].tokens for _ in self._rule_keys]
@property
def last_matched_rules(self):
"""
`list` of `TransliterationRule`: Last transliteration rules matched.
"""
return [self._rules[_] for _ in self._rule_keys]
@property
def metadata(self):
"""
`dict`: Metadata of transliterator
"""
return self._metadata
@property
def onmatch_rules_lookup(self):
"""
`dict`: On Match Rules lookup
"""
return self._onmatch_rules_lookup
@property
def tokenizer_pattern(self):
"""
`str`: Tokenizer pattern from transliterator
"""
return self._tokenizer_pattern
@property
def tokens_by_class(self):
"""
`dict` of {`str`: `set` of `str`}: Mapping of token classes to their tokens.
"""
return self._tokens_by_class
@property
def onmatch_rules(self):
"""`list` of :class:`OnMatchRule`: Rules for productions between matches."""
return self._onmatch_rules
@property
def productions(self):
"""
`list` of `str`: List of productions of each transliteration rule.
"""
return [_.production for _ in self.rules]
@property
def rules(self):
"""
`list` of `TransliterationRule`: Transliteration rules sorted by cost.
"""
return self._rules
@property
def tokens(self):
"""
`dict` of {`str`:`set` of `str`}: Mappings of tokens to their classes.
"""
return self._tokens
@property
def whitespace(self):
"""`WhitespaceRules`: Whitespace rules."""
return self._whitespace
# ---------- public functions ----------
def dump(self, compression_level=0):
"""
Dump configuration of Graph Transliterator to Python data types.
Compression is turned off by default.
Parameters
----------
compression_level: `int`
One of 0 (default, no compression), 1 (compression including the
graph), or 2 (compression without the graph)
Returns
-------
OrderedDict
GraphTransliterator configuration as a dictionary with keys:
``"tokens"``
Mappings of tokens to their classes
(`OrderedDict` of {str: `list` of `str`})
``"rules"``
Transliteration rules in direct format
(`list` of `dict` of {`str`: `str`})
``"whitespace"``
Whitespace settings
(`dict` of {`str`: `str`})
``"onmatch_rules"``
On match rules
(`list` of `OrderedDict`)
``"metadata"``
Dictionary of metadata (`dict`)
``"ignore_errors"``
Ignore errors in transliteration (`bool`)
``"onmatch_rules_lookup"``
Dictionary keyed by current token to previous token
containing a list of indexes of applicable :class:`OnmatchRule`
to try
(`dict` of {`str`: `dict` of {`str`: `list` of `int`}})
``"tokens_by_class"``
Tokens keyed by token class, used internally
(`dict` of {`str`: `list` of `str`})
``"graph"``
Serialization of `DirectedGraph`
(`dict`)
``"tokenizer_pattern"``
Regular expression for tokenizing
(`str`)
``"graphtransliterator_version"``
Module version of `graphtransliterator` (`str`)
Example
-------
.. jupyter-execute::
yaml_ = '''
tokens:
a: [vowel]
' ': [wb]
rules:
a: A
' ': ' '
whitespace:
default: " "
consolidate: false
token_class: wb
onmatch_rules:
- <vowel> + <vowel>: ',' # add a comma between vowels
metadata:
author: "Author McAuthorson"
'''
gt = GraphTransliterator.from_yaml(yaml_)
gt.dump()
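At the default level 0 the full configuration is returned; a compressed
dump instead wraps the settings alongside the module version:
.. jupyter-execute::
sorted(gt.dump(compression_level=2).keys())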
See Also
--------
dumps : Dump Graph Transliterator configuration to JSON string
load : Load Graph Transliteration from configuration in Python data types
loads : Load Graph Transliteration from configuration as a JSON string
""" # noqa
if compression_level == 0:
return GraphTransliteratorSchema().dump(self)
elif compression_level not in range(1, HIGHEST_COMPRESSION_LEVEL + 1):
raise ValueError(
f"Compression level must be between 0 and {HIGHEST_COMPRESSION_LEVEL}"
)
return {
"graphtransliterator_version": __version__,
"compressed_settings": compress_config(
GraphTransliteratorSchema().dump(self),
compression_level=compression_level,
),
}
def dumps(self, compression_level=2):
"""
Dump settings of Graph Transliterator to JavaScript Object Notation (JSON).
Compression is turned on by default, and the JSON string is generated
with compact separators.
Parameters
----------
compression_level: `int`
One of 0 (no compression), 1 (compression including the graph),
or 2 (default, compression without the graph)
Returns
-------
`str`
JSON string
Examples
--------
.. jupyter-execute::
yaml_ = '''
tokens:
a: [vowel]
' ': [wb]
rules:
a: A
' ': ' '
whitespace:
default: " "
consolidate: false
token_class: wb
onmatch_rules:
- <vowel> + <vowel>: ',' # add a comma between vowels
metadata:
author: "Author McAuthorson"
'''
gt = GraphTransliterator.from_yaml(yaml_)
gt.dumps()
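The compressed JSON round-trips through :meth:`loads`:
.. jupyter-execute::
GraphTransliterator.loads(gt.dumps()).transliterate('a a')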
See Also
--------
dump : Dump Graph Transliterator configuration to Python data types
load : Load Graph Transliteration from configuration in Python data types
loads : Load Graph Transliteration from configuration as a JSON string
""" # noqa
if compression_level == 0:
return GraphTransliteratorSchema().dumps(self)
_config = self.dump(compression_level=compression_level)
return json.dumps(_config, separators=(",", ":"))
def match_at(self, token_i, tokens, match_all=False):
"""
Match best (least costly) transliteration rule at a given index in the
input tokens and return the index to that rule. Optionally, return all
rules that match.
Parameters
----------
token_i : `int`
Location in `tokens` at which to begin
tokens : `list` of `str`
List of tokens
match_all : `bool`, optional
If true, return the indexes of all rules matching at the given
location. The default is false.
Returns
-------
`int`, `None`, or `list` of `int`
Index of matching transliteration rule in
:attr:`GraphTransliterator.rules` or None. Returns a `list` of
`int` or an empty `list` if ``match_all`` is true.
Note
----
Expects whitespace tokens at the beginning and end of `tokens`.
Examples
--------
.. jupyter-execute::
gt = GraphTransliterator.from_yaml('''
tokens:
a: []
a a: []
' ': [wb]
rules:
a: <A>
a a: <AA>
whitespace:
default: ' '
consolidate: True
token_class: wb
''')
tokens = gt.tokenize("aa")
tokens # whitespace added to ends
.. jupyter-execute::
gt.match_at(1, tokens) # returns index to rule
.. jupyter-execute::
gt.rules[gt.match_at(1, tokens)] # actual rule
.. jupyter-execute::
gt.match_at(1, tokens, match_all=True) # index to rules, with match_all
.. jupyter-execute::
[gt.rules[_] for _ in gt.match_at(1, tokens, match_all=True)]
""" # noqa
graph = self._graph
graph_node = graph.node
graph_edge = graph.edge
if match_all:
matches = []
stack = deque()
def _append_children(node_key, token_i):
children = None
ordered_children = graph_node[node_key].get("ordered_children")
if ordered_children:
children = ordered_children.get(tokens[token_i])
if children:
# reordered high to low for stack:
for child_key in reversed(children):
stack.appendleft((child_key, node_key, token_i))
else:
rules_keys = ordered_children.get("__rules__")  # leaves
if rules_keys:
# There may be more than one rule, as certain rules have
# constraints on them.
# Reordered so higher-cost rules go on the stack last.
for rule_key in reversed(rules_keys):
stack.appendleft((rule_key, node_key, token_i))
_append_children(0, token_i) # Append all children of root node
while stack: # LIFO
node_key, parent_key, token_i = stack.popleft()
curr_node = graph_node[node_key]
# Constraints are only on the preceding edge, and only if the node
# is accepting; the edge is accessed regardless so coverage is tested.
incident_edge = graph_edge[parent_key][node_key]
# Pass edge, curr_node, token index, and tokens to check constraints
if curr_node.get("accepting") and self._match_constraints(
incident_edge, curr_node, token_i, tokens
):
if match_all:
matches.append(curr_node["rule_key"])
continue
else:
return curr_node["rule_key"]
else:
if token_i < len(tokens) - 1:
token_i += 1
_append_children(node_key, token_i)
if match_all:
return matches
def pruned_of(self, productions):
"""
Remove transliteration rules with specific output productions.
Parameters
----------
productions : `str`, or `list` of `str`
list of productions to remove
Returns
-------
graphtransliterator.GraphTransliterator
Graph transliterator pruned of certain productions.
Note
----
Uses original initialization parameters to construct a new
:class:`GraphTransliterator`.
Examples
--------
.. jupyter-execute::
gt = GraphTransliterator.from_yaml('''
tokens:
a: []
a a: []
' ': [wb]
rules:
a: <A>
a a: <AA>
whitespace:
default: ' '
consolidate: True
token_class: wb
''')
gt.rules
.. jupyter-execute::
gt.pruned_of('<AA>').rules
.. jupyter-execute::
gt.pruned_of(['<A>', '<AA>']).rules
""" # noqa
pruned_rules = [_ for _ in self._rules if _.production not in productions]
return GraphTransliterator(
self._tokens,
pruned_rules,
self._whitespace,
onmatch_rules=self._onmatch_rules,
metadata=self._metadata,
ignore_errors=self._ignore_errors,
check_ambiguity=self._check_ambiguity,
)
def tokenize(self, input):
"""
Tokenizes an input string.
Adds initial and trailing whitespace, which can be consolidated.
Parameters
----------
input : str
String to tokenize
Returns
-------
`list` of `str`
List of tokens, with default whitespace token at beginning and end.
Raises
------
ValueError
Unrecognizable input, such as a character that is not in a token
Examples
--------
.. jupyter-execute::
tokens = {'ab': ['class_ab'], ' ': ['wb']}
whitespace = {'default': ' ', 'token_class': 'wb', 'consolidate': True}
rules = {'ab': 'AB', ' ': '_'}
settings = {'tokens': tokens, 'rules': rules, 'whitespace': whitespace}
gt = GraphTransliterator.from_easyreading_dict(settings)
gt.tokenize('ab ')
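With ``consolidate`` set, runs of leading and trailing whitespace
collapse into the single boundary tokens:
.. jupyter-execute::
gt.tokenize(' ab ')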
"""
def is_whitespace(token):
"""Check if token is whitespace."""
return self.whitespace.token_class in self.tokens[token]
# start with a whitespace token
tokens = [self.whitespace.default]
prev_whitespace = True
match_at = 0
while match_at < len(input):
match = self._tokenizer.match(input, match_at)
if match:
match_at = match.end() # advance match_at
token = match.group(0)
# Could save match loc here:
# matched_at = match.span(0)[0]
if is_whitespace(token):
if prev_whitespace and self.whitespace.consolidate:
continue
else:
prev_whitespace = True
else:
prev_whitespace = False
tokens.append(token)
else:
logger.warning(
"Unrecognizable token %s at pos %s of %s"
% (input[match_at], match_at, input)
)
if not self.ignore_errors:
raise UnrecognizableInputTokenException
else:
match_at += 1
if self.whitespace.consolidate:
while len(tokens) > 1 and is_whitespace(tokens[-1]):
tokens.pop()
tokens.append(self.whitespace.default)
return tokens
def transliterate(self, input):
"""
Transliterate an input string into an output string.
Parameters
----------
input : `str`
Input string to transliterate
Returns
-------
`str`
Transliteration output string
Raises
------
ValueError
Cannot parse input
Note
----
Whitespace will be temporarily appended to start and end of input
string.
Example
-------
.. jupyter-execute::
GraphTransliterator.from_yaml(
'''
tokens:
a: []
' ': [wb]
rules:
a: A
' ': '_'
whitespace:
default: ' '
consolidate: True
token_class: wb
''').transliterate("a a")
"""
tokens = self.tokenize(input) # Adds initial+final whitespace
self._input_tokens = tokens # Tokens are saved here
self._rule_keys = []  # Matched rule keys are saved here
output = ""
token_i = 1 # Adjust for initial whitespace
while token_i < len(tokens) - 1: # Adjust for final whitespace
rule_key = self.match_at(token_i, tokens)
if rule_key is None:
logger.warning(
"No matching transliteration rule at token pos %s of %s"
% (token_i, tokens)
)
# No parsing rule was found at this location
if self.ignore_errors:
# Move along if ignoring errors
token_i += 1
continue
else:
raise NoMatchingTransliterationRuleException
self._rule_keys.append(rule_key)
rule = self.rules[rule_key]
tokens_matched = rule.tokens
if self._onmatch_rules:
curr_match_rules = None
prev_t = tokens[token_i - 1]
curr_t = tokens[token_i]
curr_t_rules = self._onmatch_rules_lookup.get(curr_t)
if curr_t_rules:
curr_match_rules = curr_t_rules.get(prev_t)
if curr_match_rules:
for onmatch_i in curr_match_rules:
onmatch = self._onmatch_rules[onmatch_i]
# For an onmatch rule <class_a> <class_a> + <class_b> matched
# against tokens a a b, with token_i at b: prev_classes are checked
# starting at token_i - len(onmatch.prev_classes), and next_classes
# starting at token_i.
if self._match_tokens(
token_i - len(onmatch.prev_classes),
onmatch.prev_classes, # Checks last value
tokens,
check_prev=True,
check_next=False,
by_class=True,
) and self._match_tokens(
token_i,
onmatch.next_classes, # Checks first value
tokens,
check_prev=False,
check_next=True,
by_class=True,
):
output += onmatch.production
break # Only match best onmatch rule
output += rule.production
token_i += len(tokens_matched)
return output
# ---------- static methods ----------
@staticmethod
def from_dict(dict_settings, **kwargs):
"""Generate GraphTransliterator from `dict` settings.
Parameters
----------
dict_settings : `dict`
Dictionary of settings
Returns
-------
GraphTransliterator
Graph transliterator
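Example
-------
A minimal sketch in the direct settings format (the cost value is
illustrative; ``None``-valued rule fields are omitted here, as in the
:meth:`load` example):
.. code-block:: python
settings = {'tokens': {'a': ['vowel'], ' ': ['wb']},
'rules': [{'production': 'A', 'tokens': ['a'], 'cost': 0.585},
{'production': ' ', 'tokens': [' '], 'cost': 0.585}],
'whitespace': {'default': ' ', 'token_class': 'wb',
'consolidate': False}}
gt = GraphTransliterator.from_dict(settings)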
"""
settings = SettingsSchema().load(dict_settings)
args = [settings["tokens"], settings["rules"], settings["whitespace"]]
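# Note: the dict literal below reads the caller's kwargs before the
# name `kwargs` is rebound to the new dict.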
kwargs = {
"onmatch_rules": settings.get("onmatch_rules"),
"metadata": settings.get("metadata"),
"tokens_by_class": settings.get("tokens_by_class"), # will be generated
"graph": settings.get("graph"), # will be generated
"tokenizer_pattern": settings.get("tokenizer_pattern"), # will be generated
"ignore_errors": kwargs.get("ignore_errors", False),
"check_ambiguity": kwargs.get("check_ambiguity", True),
}
return GraphTransliterator(*args, **kwargs)
@staticmethod
def from_easyreading_dict(easyreading_settings, **kwargs):
"""
Constructs `GraphTransliterator` from a dictionary of settings in
"easy reading" format, i.e. the loaded contents of a YAML string.
Parameters
----------
easyreading_settings : `dict`
Settings dictionary in easy reading format with keys:
``"tokens"``
Mappings of tokens to their classes
(`dict` of {str: `list` of `str`})
``"rules"``
Transliteration rules in "easy reading" format
(`list` of `dict` of {`str`: `str`})
``"onmatch_rules"``
On match rules in "easy reading" format
(`dict` of {`str`: `str`}, optional)
``"whitespace"``
Whitespace definitions, including default whitespace token,
class of whitespace tokens, and whether or not to consolidate
(`dict` of {'default': `str`, 'token_class': `str`,
'consolidate': `bool`}, optional)
``"metadata"``
Dictionary of metadata (`dict`, optional)
Returns
-------
GraphTransliterator
Graph Transliterator
Note
----
Called by :meth:`from_yaml`.
Example
-------
.. jupyter-execute::
tokens = {
'ab': ['class_ab'],
' ': ['wb']
}
whitespace = {
'default': ' ',
'token_class': 'wb',
'consolidate': True
}
onmatch_rules = [
{'<class_ab> + <class_ab>': ','}
]
rules = {'ab': 'AB',
' ': '_'}
settings = {'tokens': tokens,
'rules': rules,
'whitespace': whitespace,
'onmatch_rules': onmatch_rules}
gt = GraphTransliterator.from_easyreading_dict(settings)
gt.transliterate("ab abab")
See Also
--------
from_yaml : Constructor from YAML string in "easy reading" format
from_yaml_file : Constructor from YAML file in "easy reading" format
"""
# Validate easyreading settings
_ = EasyReadingSettingsSchema().load(easyreading_settings)
# Convert those to regular settings
_ = _process_easyreading_settings(_)
# Validation of regular settings is done in from_dict
return GraphTransliterator.from_dict(_, **kwargs)
@staticmethod
def from_yaml(yaml_str, charnames_escaped=True, **kwargs):
"""
Construct GraphTransliterator from a YAML str.
Parameters
----------
yaml_str : str
YAML mappings of tokens, rules, and (optionally) onmatch_rules
charnames_escaped : boolean
Unescape Unicode character names during YAML load (default is True)
Note
----
Called by :meth:`from_yaml_file` and calls :meth:`from_easyreading_dict`.
Example
-------
.. jupyter-execute::
yaml_ = '''
tokens:
a: [class1]
' ': [wb]
rules:
a: A
' ': ' '
whitespace:
default: ' '
consolidate: True
token_class: wb
onmatch_rules:
- <class1> + <class1>: "+"
'''
gt = GraphTransliterator.from_yaml(yaml_)
gt.transliterate("a aa")
See Also
--------
from_easyreading_dict : Constructor from dictionary in "easy reading" format
from_yaml_file : Constructor from YAML file in "easy reading" format
"""
if charnames_escaped:
yaml_str = _unescape_charnames(yaml_str)
settings = yaml.safe_load(yaml_str)
return GraphTransliterator.from_easyreading_dict(settings, **kwargs)
@staticmethod
def from_yaml_file(yaml_filename, **kwargs):
"""
Construct GraphTransliterator from YAML file.
Parameters
----------
yaml_filename : str
Name of YAML file, containing tokens, rules, and (optionally)
onmatch_rules
Note
----
Calls :meth:`from_yaml`.
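Example
-------
A sketch, assuming ``transliterator.yaml`` contains settings in "easy
reading" format:
.. code-block:: python
gt = GraphTransliterator.from_yaml_file('transliterator.yaml')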
See Also
--------
from_yaml : Constructor from YAML string in "easy reading" format
from_easyreading_dict : Constructor from dictionary in "easy reading" format
"""
with open(yaml_filename, "r") as f:
yaml_string = f.read()
return GraphTransliterator.from_yaml(yaml_string, **kwargs)
@staticmethod
def load(settings, **kwargs):
"""Create GraphTransliterator from settings as Python data types.
Parameters
----------
settings
GraphTransliterator configuration as a dictionary with keys:
``"tokens"``
Mappings of tokens to their classes
(`dict` of {str: `list` of `str`})
``"rules"``
Transliteration rules in direct format
(`list` of `OrderedDict` of {`str`: `str`})
``"whitespace"``
Whitespace settings
(`dict` of {`str`: `str`})
``"onmatch_rules"``
On match rules
(`list` of `OrderedDict`, optional)
``"metadata"``
Dictionary of metadata (`dict`, optional)
``"ignore_errors"``
Ignore errors. (`bool`, optional)
``"onmatch_rules_lookup"``
Dictionary keyed by current token to previous token
containing a list of indexes of applicable :class:`OnmatchRule`
to try
(`dict` of {`str`: `dict` of {`str`: `list` of `int`}}, optional)
``"tokens_by_class"``
Tokens keyed by token class, used internally
(`dict` of {`str`: `list` of `str`}, optional)
``"graph"``
Serialization of `DirectedGraph`
(`dict`, optional)
``"tokenizer_pattern"``
Regular expression for tokenizing
(`str`, optional)
``"graphtransliterator_version"``
Module version of `graphtransliterator` (`str`, optional)
Returns
-------
GraphTransliterator
Graph Transliterator
Example
-------
.. jupyter-execute::
from collections import OrderedDict
settings = \
{'tokens': {'a': ['vowel'], ' ': ['wb']},
'rules': [OrderedDict([('production', 'A'),
# Can be compacted, removing None values
# ('prev_tokens', None),
('tokens', ['a']),
('next_classes', None),
('next_tokens', None),
('cost', 0.5849625007211562)]),
OrderedDict([('production', ' '),
('prev_classes', None),
('prev_tokens', None),
('tokens', [' ']),
('next_classes', None),
('next_tokens', None),
('cost', 0.5849625007211562)])],
'whitespace': {'default': ' ', 'token_class': 'wb', 'consolidate': False},
'onmatch_rules': [OrderedDict([('prev_classes', ['vowel']),
('next_classes', ['vowel']),
('production', ',')])],
'metadata': {'author': 'Author McAuthorson'},
'onmatch_rules_lookup': {'a': {'a': [0]}},
'tokens_by_class': {'vowel': ['a'], 'wb': [' ']},
'graph': {'edge': {0: {1: {'token': 'a', 'cost': 0.5849625007211562},
3: {'token': ' ', 'cost': 0.5849625007211562}},
1: {2: {'cost': 0.5849625007211562}},
3: {4: {'cost': 0.5849625007211562}}},
'node': [{'type': 'Start', 'ordered_children': {'a': [1], ' ': [3]}},
{'type': 'token', 'token': 'a', 'ordered_children': {'__rules__': [2]}},
{'type': 'rule',
'rule_key': 0,
'accepting': True,
'ordered_children': {}},
{'type': 'token', 'token': ' ', 'ordered_children': {'__rules__': [4]}},
{'type': 'rule',
'rule_key': 1,
'accepting': True,
'ordered_children': {}}],
'edge_list': [(0, 1), (1, 2), (0, 3), (3, 4)]},
'tokenizer_pattern': '(a|\\ )',
'graphtransliterator_version': '0.3.3'}
gt = GraphTransliterator.load(settings)
gt.transliterate('aa')
.. jupyter-execute::
# can be compacted
settings.pop('onmatch_rules_lookup')
GraphTransliterator.load(settings).transliterate('aa')
See Also
--------
dump : Dump Graph Transliterator configuration to Python data types
dumps : Dump Graph Transliterator configuration to JSON string
loads : Load Graph Transliteration from configuration as a JSON string
""" # noqa
# combine kwargs with settings
return GraphTransliteratorSchema().load(dict(settings, **kwargs))
@staticmethod
def loads(settings, **kwargs):
"""Create GraphTransliterator from JavaScript Object Notation (JSON) string.
Parameters
----------
settings
JSON settings for GraphTransliterator
Returns
-------
GraphTransliterator
Graph Transliterator
Example
-------
.. jupyter-execute::
JSON_settings = '''{"tokens": {"a": ["vowel"], " ": ["wb"]}, "rules": [{"production": "A", "prev_classes": null, "prev_tokens": null, "tokens": ["a"], "next_classes": null, "next_tokens": null, "cost": 0.5849625007211562}, {"production": " ", "prev_classes": null, "prev_tokens": null, "tokens": [" "], "next_classes": null, "next_tokens": null, "cost": 0.5849625007211562}], "whitespace": {"default": " ", "token_class": "wb", "consolidate": false}, "onmatch_rules": [{"prev_classes": ["vowel"], "next_classes": ["vowel"], "production": ","}], "metadata": {"author": "Author McAuthorson"}, "ignore_errors": false, "onmatch_rules_lookup": {"a": {"a": [0]}}, "tokens_by_class": {"vowel": ["a"], "wb": [" "]}, "graph": {"node": [{"type": "Start", "ordered_children": {"a": [1], " ": [3]}}, {"type": "token", "token": "a", "ordered_children": {"__rules__": [2]}}, {"type": "rule", "rule_key": 0, "accepting": true, "ordered_children": {}}, {"type": "token", "token": " ", "ordered_children": {"__rules__": [4]}}, {"type": "rule", "rule_key": 1, "accepting": true, "ordered_children": {}}], "edge": {"0": {"1": {"token": "a", "cost": 0.5849625007211562}, "3": {"token": " ", "cost": 0.5849625007211562}}, "1": {"2": {"cost": 0.5849625007211562}}, "3": {"4": {"cost": 0.5849625007211562}}}, "edge_list": [[0, 1], [1, 2], [0, 3], [3, 4]]}, "tokenizer_pattern": "(a| )", "graphtransliterator_version": "1.2.2"}'''
gt = GraphTransliterator.loads(JSON_settings)
gt.transliterate('a')
See Also
--------
dump : Dump Graph Transliterator configuration to Python data types
dumps : Dump Graph Transliterator configuration to JSON string
load : Load Graph Transliteration from configuration in Python data types
""" # noqa
# combine kwargs with settings
_settings = dict(json.loads(settings), **kwargs)
return GraphTransliteratorSchema().load(_settings)
class CoverageTransliterator(GraphTransliterator):
"""Subclass of GraphTransliterator that logs visits to graph and on_match rules.
Used to confirm that tests cover the entire graph and onmatch_rules."""
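# Illustrative sketch: construct with the same arguments as
# GraphTransliterator, exercise the transliterator, then verify that
# every graph node/edge and onmatch rule was visited:
#     covt = CoverageTransliterator(tokens, rules, whitespace)
#     covt.transliterate('a')
#     covt.check_coverage(raise_exception=False)  # True only if all visited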
def __init__(self, *args, **kwargs):
# Initialize from GraphTransliterator
GraphTransliterator.__init__(self, *args, **kwargs)
# Convert _graph and _onmatch_rules to visit-tracking objects
self._graph = VisitLoggingDirectedGraph(self._graph)
self._onmatch_rules = VisitLoggingList(self._onmatch_rules)
def clear_visited(self):
"""Clear visited flags from graph and onmatch_rules."""
self._graph.clear_visited()
if self._onmatch_rules:
self._onmatch_rules.clear_visited()
def check_onmatchrules_coverage(self, raise_exception=True):
"""Check coverage of onmatch rules."""
errors = []
onmatch_rules = self._onmatch_rules
for i, onmatch_rule in enumerate(onmatch_rules.data):  # .data avoids marking visits
if i not in onmatch_rules.visited:
logger.warning(
"On Match Rule {} [{}] has not been visited.".format(
i, onmatch_rule
)
)
errors.append(i)
if errors and raise_exception:
error_msg = "Missed OnMatchRules: " + ",".join([str(i) for i in errors])
raise IncompleteOnMatchRulesCoverageException(error_msg)
return not errors
def check_coverage(self, raise_exception=True):
"""Check coverage of graph and onmatch rules.
First checks graph coverage, then checks onmatch rules."""
return self._graph.check_coverage(
raise_exception=raise_exception
) and self.check_onmatchrules_coverage(raise_exception=raise_exception)