ohmyzsh/plugins/emoji/update_emoji.py

"""
Update Emoji.py
Refeshes OMZ emoji database based on the latest Unicode spec
"""
import re
import json

spec = open("emoji-data.txt", "r")

# Regexes
# regex_emoji will return, respectively:
# the code points, its type (status), the actual emoji, and its official name
regex_emoji = r"^([\w ].*?\S)\s*;\s*([\w-]+)\s*#\s*(.*?)\s(\S.*).*$"
# regex_group returns the group of subgroup that a line opens
regex_group = r"^#\s*(group|subgroup):\s*(.*)$"

headers = """
# emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin
#
# This file is auto-generated by update_emoji.py. Do not edit it manually.
#
# This contains the definition for:
#   $emoji         - which maps character names to Unicode characters
#   $emoji_flags   - maps country names to Unicode flag characters using region
#                    indicators
#   $emoji_mod     - maps modifier components to Unicode characters
#   $emoji_groups  - a single associative array to avoid cluttering up the
#                    global namespace, and to allow adding additional group
#                    definitions at run time. The keys are the group names, and
#                    the values are whitespace-separated lists of emoji
#                    character names.

# Main emoji
typeset -gAH emoji
# National flags
typeset -gAH emoji_flags
# Combining modifiers
typeset -gAH emoji_mod
# Emoji groups
typeset -gAH emoji_groups
"""

#######
# Adding country codes
#######
# This is the only part of this script that relies on an external library
# (country_converter), and is hence commented out by default.
# You can uncomment it to have country codes added as aliases for flag
# emojis. (By default, when you install this extension, country codes are
# included as aliases, but not if you re-run this script without uncommenting.)
# Warning: country_converter is very verbose, and will print warnings all over
# your terminal.

# import country_converter as coco # pylint: disable=wrong-import-position
# cc = coco.CountryConverter()

# def country_iso(_all_names, _omz_name):
#     """ Using the external library country_converter,
#         this funciton can detect the ISO2 and ISO3 codes
#         of the country. It takes as argument the array
#         with all the names of the emoji, and returns that array."""
#     omz_no_underscore = re.sub(r'_', r' ', _omz_name)
#     iso2 = cc.convert(names=[omz_no_underscore], to='ISO2')
#     if iso2 != 'not found':
#         _all_names.append(iso2)
#         iso3 = cc.convert(names=[omz_no_underscore], to='ISO3')
#         _all_names.append(iso3)
#     return _all_names


#######
# Helper functions
#######

def code_to_omz(_code_points):
    """ Returns a ZSH-compatible Unicode string from the code point(s) """
    return r'\U' + r'\U'.join(_code_points.split(' '))

def name_to_omz(_name, _group, _subgroup, _status):
    """ Returns a reasonable snake_case name for the emoji. """
    def snake_case(_string):
        """ Does the regex work of snake_case """
        remove_dots = re.sub(r'\.\(\)', r'', _string)
        replace_ands = re.sub(r'\&', r'and', remove_dots)
        remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands)
        return re.sub(r'__', r'_', remove_whitespace)

    shortname = ""
    split_at_colon = lambda s: s.split(": ")
    # Special treatment by group and subgroup
    # If the emoji is a flag, we strip "flag" from its name
    if _group == "Flags" and len(split_at_colon(_name)) > 1:
        shortname = snake_case(split_at_colon(_name)[1])
    else:
        shortname = snake_case(_name)
    # Special treatment by status
    # Enables us to have every emoji combination,
    # even the one that are not officially sanctionned
    # and are implemeted by, say, only one vendor
    if _status == "unqualified":
        shortname += "_unqualified"
    elif _status == "minimally-qualified":
        shortname += "_minimally"
    return shortname

def increment_name(_shortname):
    """ Increment the short name by 1. If you get, say,
    'woman_detective_unqualified', it returns
    'woman_detective_unqualified_1', and then
    'woman_detective_unqualified_2', etc. """
    last_char = _shortname[-1]
    if last_char.isdigit():
        num = int(last_char)
        return _shortname[:-1] + str(num + 1)
    return _shortname + "_1"

########
# Going through every line
########

group, subgroup, short_name_buffer = "", "", ""
emoji_database = []
for line in spec:
    # First, test if this line opens a group or subgroup
    group_match = re.findall(regex_group, line)
    if group_match != []:
        gr_or_sub, name = group_match[0]
        if gr_or_sub == "group":
            group = name
        elif gr_or_sub == "subgroup":
            subgroup = name
        continue # Moving on...
    # Second, test if this line references one emoji
    emoji_match = re.findall(regex_emoji, line)
    if emoji_match != []:
        code_points, status, emoji, name = emoji_match[0]
        omz_codes = code_to_omz(code_points)
        omz_name = name_to_omz(name, group, subgroup, status)
        # If this emoji has the same shortname as the preceding one
        if omz_name in short_name_buffer:
            omz_name = increment_name(short_name_buffer)
        short_name_buffer = omz_name
        emoji_database.append(
            [omz_codes, status, emoji, omz_name, group, subgroup])
spec.close()

########
# Write to emoji-char-definitions.zsh
########

# Aliases for emojis are retrieved through the DB of Gemoji
# Retrieved on Aug 9 2019 from the following URL:
# https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json

gemoji_db = open("gemoji_db.json")
j = json.load(gemoji_db)
aliases_map = {entry['emoji']: entry['aliases'] for entry in j}
all_omz_names = [emoji_data[3] for emoji_data in emoji_database]

# Let's begin writing to this file
output = open("emoji-char-definitions.zsh", "w")
output.write(headers)

emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n",
                "people": "\n", "animals": "\n", "faces": "\n",
                "flags": "\n"}

# First, write every emoji down
for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database:

    # One emoji can be mapped to multiple names (aliases or country codes)
    names_for_this_emoji = [_omz_name]

    # Variable that indicates in which map the emoji will be located
    emoji_map = "emoji"
    if _status == "component":
        emoji_map = "emoji_mod"
    if _group == "Flags":
        emoji_map = "emoji_flags"
        # Adding country codes (Optional, see above)
        # names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name)

    # Check if there is an alias available in the Gemoji DB
    if _emoji in aliases_map.keys():
        for alias in aliases_map[_emoji]:
            if alias not in all_omz_names:
                names_for_this_emoji.append(alias)

    # And now we write to the definitions file
    for one_name in names_for_this_emoji:
        output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n")

    # Storing the emoji in defined subgroups for the next step
    if _status == "fully-qualified":
        if _subgroup == "food-fruit":
            emoji_groups["fruits"] += f"  {_omz_name}\n"
        elif "transport-" in _subgroup:
            emoji_groups["vehicles"] += f"  {_omz_name}\n"
        elif "hand-" in _subgroup:
            emoji_groups["hands"] += f"  {_omz_name}\n"
        elif "person-" in _subgroup or _subgroup == "family":
            emoji_groups["people"] += f"  {_omz_name}\n"
        elif "animal-" in _subgroup:
            emoji_groups["animals"] += f"  {_omz_name}\n"
        elif "face-" in _subgroup:
            emoji_groups["faces"] += f"  {_omz_name}\n"
        elif _group == "Flags":
            emoji_groups["flags"] += f"  {_omz_name}\n"

# Second, write the subgroups to the end of the file
for name, string in emoji_groups.items():
    output.write(f'\nemoji_groups[{name}]="{string}"\n')
output.close()
refactor(emoji): rewrite script w/ Python and update emojis (#8069) Closes #8069 2019-08-10 01:56:57 +00:00			`"""`
			`Update Emoji.py`
			`Refeshes OMZ emoji database based on the latest Unicode spec`
			`"""`
			`import re`
			`import json`

			`spec = open("emoji-data.txt", "r")`

			`# Regexes`
			`# regex_emoji will return, respectively:`
			`# the code points, its type (status), the actual emoji, and its official name`
			`regex_emoji = r"^([\w ].?\S)\s;\s([\w-]+)\s#\s(.?)\s(\S.).$"`
			`# regex_group returns the group of subgroup that a line opens`
			`regex_group = r"^#\s(group\|subgroup):\s(.*)$"`

			`headers = """`
			`# emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin`
			`#`
			`# This file is auto-generated by update_emoji.py. Do not edit it manually.`
			`#`
			`# This contains the definition for:`
			`# $emoji - which maps character names to Unicode characters`
			`# $emoji_flags - maps country names to Unicode flag characters using region`
			`# indicators`
			`# $emoji_mod - maps modifier components to Unicode characters`
			`# $emoji_groups - a single associative array to avoid cluttering up the`
			`# global namespace, and to allow adding additional group`
			`# definitions at run time. The keys are the group names, and`
			`# the values are whitespace-separated lists of emoji`
			`# character names.`

			`# Main emoji`
			`typeset -gAH emoji`
			`# National flags`
			`typeset -gAH emoji_flags`
			`# Combining modifiers`
			`typeset -gAH emoji_mod`
			`# Emoji groups`
			`typeset -gAH emoji_groups`
			`"""`

			`#######`
			`# Adding country codes`
			`#######`
			`# This is the only part of this script that relies on an external library`
			`# (country_converter), and is hence commented out by default.`
			`# You can uncomment it to have country codes added as aliases for flag`
			`# emojis. (By default, when you install this extension, country codes are`
			`# included as aliases, but not if you re-run this script without uncommenting.)`
			`# Warning: country_converter is very verbose, and will print warnings all over`
			`# your terminal.`

			`# import country_converter as coco # pylint: disable=wrong-import-position`
			`# cc = coco.CountryConverter()`

			`# def country_iso(_all_names, _omz_name):`
			`# """ Using the external library country_converter,`
			`# this funciton can detect the ISO2 and ISO3 codes`
			`# of the country. It takes as argument the array`
			`# with all the names of the emoji, and returns that array."""`
			`# omz_no_underscore = re.sub(r'_', r' ', _omz_name)`
			`# iso2 = cc.convert(names=[omz_no_underscore], to='ISO2')`
			`# if iso2 != 'not found':`
			`# _all_names.append(iso2)`
			`# iso3 = cc.convert(names=[omz_no_underscore], to='ISO3')`
			`# _all_names.append(iso3)`
			`# return _all_names`


			`#######`
			`# Helper functions`
			`#######`

			`def code_to_omz(_code_points):`
			`""" Returns a ZSH-compatible Unicode string from the code point(s) """`
			`return r'\U' + r'\U'.join(_code_points.split(' '))`

			`def name_to_omz(_name, _group, _subgroup, _status):`
			`""" Returns a reasonable snake_case name for the emoji. """`
			`def snake_case(_string):`
			`""" Does the regex work of snake_case """`
			`remove_dots = re.sub(r'\.\(\)', r'', _string)`
			`replace_ands = re.sub(r'\&', r'and', remove_dots)`
			`remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands)`
			`return re.sub(r'__', r'_', remove_whitespace)`

			`shortname = ""`
			`split_at_colon = lambda s: s.split(": ")`
			`# Special treatment by group and subgroup`
			`# If the emoji is a flag, we strip "flag" from its name`
			`if _group == "Flags" and len(split_at_colon(_name)) > 1:`
			`shortname = snake_case(split_at_colon(_name)[1])`
			`else:`
			`shortname = snake_case(_name)`
			`# Special treatment by status`
			`# Enables us to have every emoji combination,`
			`# even the one that are not officially sanctionned`
			`# and are implemeted by, say, only one vendor`
			`if _status == "unqualified":`
			`shortname += "_unqualified"`
			`elif _status == "minimally-qualified":`
			`shortname += "_minimally"`
			`return shortname`

			`def increment_name(_shortname):`
			`""" Increment the short name by 1. If you get, say,`
			`'woman_detective_unqualified', it returns`
			`'woman_detective_unqualified_1', and then`
			`'woman_detective_unqualified_2', etc. """`
			`last_char = _shortname[-1]`
			`if last_char.isdigit():`
			`num = int(last_char)`
			`return _shortname[:-1] + str(num + 1)`
			`return _shortname + "_1"`

			`########`
			`# Going through every line`
			`########`

			`group, subgroup, short_name_buffer = "", "", ""`
			`emoji_database = []`
			`for line in spec:`
			`# First, test if this line opens a group or subgroup`
			`group_match = re.findall(regex_group, line)`
			`if group_match != []:`
			`gr_or_sub, name = group_match[0]`
			`if gr_or_sub == "group":`
			`group = name`
			`elif gr_or_sub == "subgroup":`
			`subgroup = name`
			`continue # Moving on...`
			`# Second, test if this line references one emoji`
			`emoji_match = re.findall(regex_emoji, line)`
			`if emoji_match != []:`
			`code_points, status, emoji, name = emoji_match[0]`
			`omz_codes = code_to_omz(code_points)`
			`omz_name = name_to_omz(name, group, subgroup, status)`
			`# If this emoji has the same shortname as the preceding one`
			`if omz_name in short_name_buffer:`
			`omz_name = increment_name(short_name_buffer)`
			`short_name_buffer = omz_name`
			`emoji_database.append(`
			`[omz_codes, status, emoji, omz_name, group, subgroup])`
			`spec.close()`

			`########`
			`# Write to emoji-char-definitions.zsh`
			`########`

			`# Aliases for emojis are retrieved through the DB of Gemoji`
			`# Retrieved on Aug 9 2019 from the following URL:`
			`# https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json`

			`gemoji_db = open("gemoji_db.json")`
			`j = json.load(gemoji_db)`
			`aliases_map = {entry['emoji']: entry['aliases'] for entry in j}`
			`all_omz_names = [emoji_data[3] for emoji_data in emoji_database]`

			`# Let's begin writing to this file`
			`output = open("emoji-char-definitions.zsh", "w")`
			`output.write(headers)`

			`emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n",`
			`"people": "\n", "animals": "\n", "faces": "\n",`
			`"flags": "\n"}`

			`# First, write every emoji down`
			`for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database:`

			`# One emoji can be mapped to multiple names (aliases or country codes)`
			`names_for_this_emoji = [_omz_name]`

			`# Variable that indicates in which map the emoji will be located`
			`emoji_map = "emoji"`
			`if _status == "component":`
			`emoji_map = "emoji_mod"`
			`if _group == "Flags":`
			`emoji_map = "emoji_flags"`
			`# Adding country codes (Optional, see above)`
			`# names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name)`

			`# Check if there is an alias available in the Gemoji DB`
			`if _emoji in aliases_map.keys():`
			`for alias in aliases_map[_emoji]:`
			`if alias not in all_omz_names:`
			`names_for_this_emoji.append(alias)`

			`# And now we write to the definitions file`
			`for one_name in names_for_this_emoji:`
			`output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n")`

			`# Storing the emoji in defined subgroups for the next step`
			`if _status == "fully-qualified":`
			`if _subgroup == "food-fruit":`
			`emoji_groups["fruits"] += f" {_omz_name}\n"`
			`elif "transport-" in _subgroup:`
			`emoji_groups["vehicles"] += f" {_omz_name}\n"`
			`elif "hand-" in _subgroup:`
			`emoji_groups["hands"] += f" {_omz_name}\n"`
			`elif "person-" in _subgroup or _subgroup == "family":`
			`emoji_groups["people"] += f" {_omz_name}\n"`
			`elif "animal-" in _subgroup:`
			`emoji_groups["animals"] += f" {_omz_name}\n"`
			`elif "face-" in _subgroup:`
			`emoji_groups["faces"] += f" {_omz_name}\n"`
			`elif _group == "Flags":`
			`emoji_groups["flags"] += f" {_omz_name}\n"`

			`# Second, write the subgroups to the end of the file`
			`for name, string in emoji_groups.items():`
			`output.write(f'\nemoji_groups[{name}]="{string}"\n')`
			`output.close()`