mh
/
qmk_firmware
mirror of https://github.com/qmk/qmk_firmware/


								# Copyright 2021 Google LLC

								#

								# Licensed under the Apache License, Version 2.0 (the "License");

								# you may not use this file except in compliance with the License.

								# You may obtain a copy of the License at

								#

								#     https://www.apache.org/licenses/LICENSE-2.0

								#

								# Unless required by applicable law or agreed to in writing, software

								# distributed under the License is distributed on an "AS IS" BASIS,

								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								# See the License for the specific language governing permissions and

								# limitations under the License.

								"""Python program to make autocorrect_data.h.

								This program reads from a prepared dictionary file and generates a C source file

								"autocorrect_data.h" with a serialized trie embedded as an array. Run this

								program and pass it as the first argument like:

								$ qmk generate-autocorrect-data autocorrect_dict.txt

								Each line of the dict file defines one typo and its correction with the syntax

								"typo -> correction". Blank lines or lines starting with '#' are ignored.

								Example:

								  :thier        -> their

								  fitler        -> filter

								  lenght        -> length

								  ouput         -> output

								  widht         -> width

								For full documentation, see QMK Docs

								"""


								import sys

								import textwrap

								from typing import Any, Dict, Iterator, List, Tuple


								from milc import cli


								from qmk.commands import dump_lines

								from qmk.constants import GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE

								from qmk.keyboard import keyboard_completer, keyboard_folder

								from qmk.keymap import keymap_completer, locate_keymap

								from qmk.path import normpath


								KC_A = 4

								KC_SPC = 0x2c

								KC_QUOT = 0x34


								TYPO_CHARS = dict([

								    ("'", KC_QUOT),

								    (':', KC_SPC),  # "Word break" character.

								] + [(chr(c), c + KC_A - ord('a')) for c in range(ord('a'),

								                                                  ord('z') + 1)])  # Characters a-z.


								def parse_file(file_name: str) -> List[Tuple[str, str]]:

								    """Parses autocorrections dictionary file.

								  Each line of the file defines one typo and its correction with the syntax

								  "typo -> correction". Blank lines or lines starting with '#' are ignored. The

								  function validates that typos only have characters a-z and that typos are not

								  substrings of other typos, otherwise the longer typo would never trigger.

								  Args:

								    file_name: String, path of the autocorrections dictionary.

								  Returns:

								    List of (typo, correction) tuples.

								  """


								    try:

								        import english_words

								        correct_words = english_words.get_english_words_set(['web2'], lower=True, alpha=True)

								    except AttributeError:

								        from english_words import english_words_lower_alpha_set as correct_words

								        if not cli.args.quiet:

								            cli.echo('The english_words package is outdated, update by running:')

								            cli.echo('  {fg_cyan}python3 -m pip install english_words --upgrade')

								    except ImportError:

								        if not cli.args.quiet:

								            cli.echo('Autocorrection will falsely trigger when a typo is a substring of a correctly spelled word.')

								            cli.echo('To check for this, install the english_words package and rerun this script:')

								            cli.echo('  {fg_cyan}python3 -m pip install english_words')

								        # Use a minimal word list as a fallback.

								        correct_words = ('information', 'available', 'international', 'language', 'loosest', 'reference', 'wealthier', 'entertainment', 'association', 'provides', 'technology', 'statehood')


								    autocorrections = []

								    typos = set()

								    for line_number, typo, correction in parse_file_lines(file_name):

								        if typo in typos:

								            cli.log.warning('{fg_red}Error:%d:{fg_reset} Ignoring duplicate typo: "{fg_cyan}%s{fg_reset}"', line_number, typo)

								            continue


								        # Check that `typo` is valid.

								        if not (all([c in TYPO_CHARS for c in typo])):

								            cli.log.error('{fg_red}Error:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" has characters other than a-z, \' and :.', line_number, typo)

								            sys.exit(1)

								        for other_typo in typos:

								            if typo in other_typo or other_typo in typo:

								                cli.log.error('{fg_red}Error:%d:{fg_reset} Typos may not be substrings of one another, otherwise the longer typo would never trigger: "{fg_cyan}%s{fg_reset}" vs. "{fg_cyan}%s{fg_reset}".', line_number, typo, other_typo)

								                sys.exit(1)

								        if len(typo) < 5:

								            cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} It is suggested that typos are at least 5 characters long to avoid false triggers: "{fg_cyan}%s{fg_reset}"', line_number, typo)

								        if len(typo) > 127:

								            cli.log.error('{fg_red}Error:%d:{fg_reset} Typo exceeds 127 chars: "{fg_cyan}%s{fg_reset}"', line_number, typo)

								            sys.exit(1)


								        check_typo_against_dictionary(typo, line_number, correct_words)


								        autocorrections.append((typo, correction))

								        typos.add(typo)


								    return autocorrections


								def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]:

								    """Makes a trie from the the typos, writing in reverse.

								  Args:

								    autocorrections: List of (typo, correction) tuples.

								  Returns:

								    Dict of dict, representing the trie.

								  """

								    trie = {}

								    for typo, correction in autocorrections:

								        node = trie

								        for letter in typo[::-1]:

								            node = node.setdefault(letter, {})

								        node['LEAF'] = (typo, correction)


								    return trie


								def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]:

								    """Parses lines read from `file_name` into typo-correction pairs."""


								    line_number = 0

								    for line in open(file_name, 'rt'):

								        line_number += 1

								        line = line.strip()

								        if line and line[0] != '#':

								            # Parse syntax "typo -> correction", using strip to ignore indenting.

								            tokens = [token.strip() for token in line.split('->', 1)]

								            if len(tokens) != 2 or not tokens[0]:

								                print(f'Error:{line_number}: Invalid syntax: "{line}"')

								                sys.exit(1)


								            typo, correction = tokens

								            typo = typo.lower()  # Force typos to lowercase.

								            typo = typo.replace(' ', ':')


								            yield line_number, typo, correction


								def check_typo_against_dictionary(typo: str, line_number: int, correct_words) -> None:

								    """Checks `typo` against English dictionary words."""


								    if typo.startswith(':') and typo.endswith(':'):

								        if typo[1:-1] in correct_words:

								            cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" is a correctly spelled dictionary word.', line_number, typo)

								    elif typo.startswith(':') and not typo.endswith(':'):

								        for word in correct_words:

								            if word.startswith(typo[1:]):

								                cli.log.warning('{fg_yellow}Warning:%d: {fg_reset}Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)

								    elif not typo.startswith(':') and typo.endswith(':'):

								        for word in correct_words:

								            if word.endswith(typo[:-1]):

								                cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)

								    elif not typo.startswith(':') and not typo.endswith(':'):

								        for word in correct_words:

								            if typo in word:

								                cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)


								def serialize_trie(autocorrections: List[Tuple[str, str]], trie: Dict[str, Any]) -> List[int]:

								    """Serializes trie and correction data in a form readable by the C code.

								  Args:

								    autocorrections: List of (typo, correction) tuples.

								    trie: Dict of dicts.

								  Returns:

								    List of ints in the range 0-255.

								  """

								    table = []


								    # Traverse trie in depth first order.

								    def traverse(trie_node):

								        if 'LEAF' in trie_node:  # Handle a leaf trie node.

								            typo, correction = trie_node['LEAF']

								            word_boundary_ending = typo[-1] == ':'

								            typo = typo.strip(':')

								            i = 0  # Make the autocorrection data for this entry and serialize it.

								            while i < min(len(typo), len(correction)) and typo[i] == correction[i]:

								                i += 1

								            backspaces = len(typo) - i - 1 + word_boundary_ending

								            assert 0 <= backspaces <= 63

								            correction = correction[i:]

								            bs_count = [backspaces + 128]

								            data = bs_count + list(bytes(correction, 'ascii')) + [0]


								            entry = {'data': data, 'links': [], 'byte_offset': 0}

								            table.append(entry)

								        elif len(trie_node) == 1:  # Handle trie node with a single child.

								            c, trie_node = next(iter(trie_node.items()))

								            entry = {'chars': c, 'byte_offset': 0}


								            # It's common for a trie to have long chains of single-child nodes. We

								            # find the whole chain so that we can serialize it more efficiently.

								            while len(trie_node) == 1 and 'LEAF' not in trie_node:

								                c, trie_node = next(iter(trie_node.items()))

								                entry['chars'] += c


								            table.append(entry)

								            entry['links'] = [traverse(trie_node)]

								        else:  # Handle trie node with multiple children.

								            entry = {'chars': ''.join(sorted(trie_node.keys())), 'byte_offset': 0}

								            table.append(entry)

								            entry['links'] = [traverse(trie_node[c]) for c in entry['chars']]

								        return entry


								    traverse(trie)


								    def serialize(e: Dict[str, Any]) -> List[int]:

								        if not e['links']:  # Handle a leaf table entry.

								            return e['data']

								        elif len(e['links']) == 1:  # Handle a chain table entry.

								            return [TYPO_CHARS[c] for c in e['chars']] + [0]  # + encode_link(e['links'][0]))

								        else:  # Handle a branch table entry.

								            data = []

								            for c, link in zip(e['chars'], e['links']):

								                data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link)

								            return data + [0]


								    byte_offset = 0

								    for e in table:  # To encode links, first compute byte offset of each entry.

								        e['byte_offset'] = byte_offset

								        byte_offset += len(serialize(e))

								        assert 0 <= byte_offset <= 0xffff


								    return [b for e in table for b in serialize(e)]  # Serialize final table.


								def encode_link(link: Dict[str, Any]) -> List[int]:

								    """Encodes a node link as two bytes."""

								    byte_offset = link['byte_offset']

								    if not (0 <= byte_offset <= 0xffff):

								        cli.log.error('{fg_red}Error:{fg_reset} The autocorrection table is too large, a node link exceeds 64KB limit. Try reducing the autocorrection dict to fewer entries.')

								        sys.exit(1)

								    return [byte_offset & 255, byte_offset >> 8]


								def typo_len(e: Tuple[str, str]) -> int:

								    return len(e[0])


								def to_hex(b: int) -> str:

								    return f'0x{b:02X}'


								@cli.argument('filename', type=normpath, help='The autocorrection database file')

								@cli.argument('-kb', '--keyboard', type=keyboard_folder, completer=keyboard_completer, help='The keyboard to build a firmware for. Ignored when a configurator export is supplied.')

								@cli.argument('-km', '--keymap', completer=keymap_completer, help='The keymap to build a firmware for. Ignored when a configurator export is supplied.')

								@cli.argument('-o', '--output', arg_only=True, type=normpath, help='File to write to')

								@cli.argument('-q', '--quiet', arg_only=True, action='store_true', help="Quiet mode, only output error messages")

								@cli.subcommand('Generate the autocorrection data file from a dictionary file.')

								def generate_autocorrect_data(cli):

								    autocorrections = parse_file(cli.args.filename)

								    trie = make_trie(autocorrections)

								    data = serialize_trie(autocorrections, trie)


								    current_keyboard = cli.args.keyboard or cli.config.user.keyboard or cli.config.generate_autocorrect_data.keyboard

								    current_keymap = cli.args.keymap or cli.config.user.keymap or cli.config.generate_autocorrect_data.keymap


								    if current_keyboard and current_keymap:

								        cli.args.output = locate_keymap(current_keyboard, current_keymap).parent / 'autocorrect_data.h'


								    assert all(0 <= b <= 255 for b in data)


								    min_typo = min(autocorrections, key=typo_len)[0]

								    max_typo = max(autocorrections, key=typo_len)[0]


								    # Build the autocorrect_data.h file.

								    autocorrect_data_h_lines = [GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE, '#pragma once', '']


								    autocorrect_data_h_lines.append(f'// Autocorrection dictionary ({len(autocorrections)} entries):')

								    for typo, correction in autocorrections:

								        autocorrect_data_h_lines.append(f'//   {typo:<{len(max_typo)}} -> {correction}')


								    autocorrect_data_h_lines.append('')

								    autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MIN_LENGTH {len(min_typo)} // "{min_typo}"')

								    autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MAX_LENGTH {len(max_typo)} // "{max_typo}"')

								    autocorrect_data_h_lines.append(f'#define DICTIONARY_SIZE {len(data)}')

								    autocorrect_data_h_lines.append('')

								    autocorrect_data_h_lines.append('static const uint8_t autocorrect_data[DICTIONARY_SIZE] PROGMEM = {')

								    autocorrect_data_h_lines.append(textwrap.fill('    %s' % (', '.join(map(to_hex, data))), width=100, subsequent_indent='    '))

								    autocorrect_data_h_lines.append('};')


								    # Show the results

								    dump_lines(cli.args.output, autocorrect_data_h_lines, cli.args.quiet)