SchildiChat-android/tools/import_emojis.py

#!/usr/bin/env python3
from collections import OrderedDict

import requests
import json
import re
import os
from bs4 import BeautifulSoup

# A list of words to not capitalize in emoji-names
capitalization_exclude = {'with', 'a', 'at', 'of', 'for', 'and', 'over', 'the', 'off', 'on', 'out', 'in', 'but', 'or'}

# Create skeleton of the final json file as a python dictionary:
emoji_picker_datasource = {
    "compressed": True,
    "categories": [],
    "emojis": {},
    "aliases": {}
}
emoji_picker_datasource_categories = emoji_picker_datasource["categories"]
emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"]


# Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing)
print("Fetching emoji list from Unicode.org...",)
req = requests.get("https://unicode.org/emoji/charts/emoji-list.html")
soup = BeautifulSoup(req.content, 'html.parser')

# Navigate to table
table = soup.body.table

# Go over all rows
print("Extracting emojis...")
for row in table.find_all('tr'):
    # Add "bigheads"  rows to categories
    if 'bighead' in next(row.children)['class']:
        relevant_element = row.find('a')
        category_id = relevant_element['name']
        category_name = relevant_element.text
        emoji_picker_datasource_categories.append({
            "id": category_id,
            "name": category_name,
            "emojis": []
        })

    # Add information in "rchars" rows to the last encountered category and emojis
    if row.find('td', class_='code'):
        # Get columns
        cols = row.find_all('td')
        no_element = cols[0]
        code_element = cols[1]
        sample_element = cols[2]
        cldr_element = cols[3]
        keywords_element = cols[4]

        # Extract information from columns
        # Extract name and id
        # => Remove spaces, colons and unicode-characters
        emoji_name = cldr_element.text
        emoji_id = cldr_element.text.lower()
        emoji_id = re.sub(r'[^A-Za-z0-9 ]+', '', emoji_id, flags=re.UNICODE)  # Only keep alphanumeric, space characters
        emoji_id = emoji_id.strip()  # Remove leading/trailing whitespaces
        emoji_id = emoji_id.replace(' ', '-')

        # Capitalize name according to the same rules as the previous emoji_picker_datasource.json
        # - Words are separated by any non-word character (\W), e.g. space, comma, parentheses, dots, etc.
        # - Words are capitalized if they are either at the beginning of the name OR not in capitalization_exclude (extracted from the previous datasource, too)
        emoji_name_cap = "".join([w.capitalize() if i == 0 or w not in capitalization_exclude else w for i, w in enumerate(re.split('(\W)', emoji_name))])

        # Extract emoji unicode-codepoint
        emoji_code_raw = code_element.text
        emoji_code_list = emoji_code_raw.split(" ")
        emoji_code_list = [e[2:] for e in emoji_code_list]
        emoji_code = "-".join(emoji_code_list)

        # Extract keywords
        emoji_keywords = keywords_element.text.split(" | ")

        # Add the emoji-id to the last entry in "categories"
        emoji_picker_datasource_categories[-1]["emojis"].append(emoji_id)

        # Add the emoji itself to the "emojis" dict
        emoji_picker_datasource_emojis[emoji_id] = {
                "a": emoji_name_cap,
                "b": emoji_code,
                "j": emoji_keywords
        }

# The keywords of unicode.org are usually quite sparse.
# There is no official specification of keywords beyond that, but muan/emojilib maintains a well maintained and
# established repository with additional keywords. We extend our list with the keywords from there.
# At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1.
print("Fetching additional keywords from Emojilib...")
req = requests.get("https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json")
emojilib_data = json.loads(req.content)

# We just go over all the official emojis from unicode, and add the keywords there
print("Adding keywords to emojis...")
for emoji in emoji_picker_datasource_emojis:
    emoji_name = emoji_picker_datasource_emojis[emoji]["a"]
    emoji_code = emoji_picker_datasource_emojis[emoji]["b"]

    # Convert back to actual unicode emoji
    emoji_unicode = ''.join(map(lambda s: chr(int(s, 16)), emoji_code.split("-")))

    # Search for emoji in emojilib
    if emoji_unicode in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode]
    elif emoji_unicode+chr(0xfe0f)  in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode+chr(0xfe0f)]
    else:
        print("* No additional keywords for", emoji_unicode, emoji_picker_datasource_emojis[emoji])
        continue

    # If additional keywords exist, add them to emoji_picker_datasource_emojis
    # Avoid duplicates and keep order. Put official unicode.com keywords first and extend up with emojilib ones.
    new_keywords = OrderedDict.fromkeys(emoji_picker_datasource_emojis[emoji]["j"] + emoji_additional_keywords).keys()
    # Remove the ones derived from the unicode name
    new_keywords = new_keywords - {emoji.replace("-", "_")} - {emoji.replace("-", " ")} - {emoji_name}
    # Write new keywords back
    emoji_picker_datasource_emojis[emoji]["j"] = list(new_keywords)

# Filter out components from unicode 13.1 (as they are not suitable for single-emoji reactions)
emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']

# Write result to file (overwrite previous), without escaping unicode characters
print("Writing emoji_picker_datasource.json...")
scripts_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json"), "w") as outfile:
    json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, separators=(',', ':'))
print("Done.")
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`#!/usr/bin/env python3`
Update import_emojis.py to pull keywords from emojilib, update quick reactions 2021-03-20 00:24:43 +03:00			`from collections import OrderedDict`

Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`import requests`
			`import json`
			`import re`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`import os`
Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`from bs4 import BeautifulSoup`

Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`# A list of words to not capitalize in emoji-names`
			`capitalization_exclude = {'with', 'a', 'at', 'of', 'for', 'and', 'over', 'the', 'off', 'on', 'out', 'in', 'but', 'or'}`

Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`# Create skeleton of the final json file as a python dictionary:`
			`emoji_picker_datasource = {`
			`"compressed": True,`
			`"categories": [],`
			`"emojis": {},`
			`"aliases": {}`
			`}`
			`emoji_picker_datasource_categories = emoji_picker_datasource["categories"]`
			`emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"]`


			`# Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing)`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("Fetching emoji list from Unicode.org...",)`
Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`req = requests.get("https://unicode.org/emoji/charts/emoji-list.html")`
			`soup = BeautifulSoup(req.content, 'html.parser')`

			`# Navigate to table`
			`table = soup.body.table`

			`# Go over all rows`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("Extracting emojis...")`
Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`for row in table.find_all('tr'):`
			`# Add "bigheads" rows to categories`
			`if 'bighead' in next(row.children)['class']:`
			`relevant_element = row.find('a')`
			`category_id = relevant_element['name']`
			`category_name = relevant_element.text`
			`emoji_picker_datasource_categories.append({`
			`"id": category_id,`
			`"name": category_name,`
			`"emojis": []`
			`})`

			`# Add information in "rchars" rows to the last encountered category and emojis`
			`if row.find('td', class_='code'):`
			`# Get columns`
			`cols = row.find_all('td')`
			`no_element = cols[0]`
			`code_element = cols[1]`
			`sample_element = cols[2]`
			`cldr_element = cols[3]`
			`keywords_element = cols[4]`

			`# Extract information from columns`
			`# Extract name and id`
			`# => Remove spaces, colons and unicode-characters`
			`emoji_name = cldr_element.text`
			`emoji_id = cldr_element.text.lower()`
			`emoji_id = re.sub(r'[^A-Za-z0-9 ]+', '', emoji_id, flags=re.UNICODE) # Only keep alphanumeric, space characters`
			`emoji_id = emoji_id.strip() # Remove leading/trailing whitespaces`
			`emoji_id = emoji_id.replace(' ', '-')`

Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`# Capitalize name according to the same rules as the previous emoji_picker_datasource.json`
			`# - Words are separated by any non-word character (\W), e.g. space, comma, parentheses, dots, etc.`
			`# - Words are capitalized if they are either at the beginning of the name OR not in capitalization_exclude (extracted from the previous datasource, too)`
			`emoji_name_cap = "".join([w.capitalize() if i == 0 or w not in capitalization_exclude else w for i, w in enumerate(re.split('(\W)', emoji_name))])`

Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`# Extract emoji unicode-codepoint`
			`emoji_code_raw = code_element.text`
			`emoji_code_list = emoji_code_raw.split(" ")`
			`emoji_code_list = [e[2:] for e in emoji_code_list]`
			`emoji_code = "-".join(emoji_code_list)`

			`# Extract keywords`
			`emoji_keywords = keywords_element.text.split(" \| ")`

			`# Add the emoji-id to the last entry in "categories"`
			`emoji_picker_datasource_categories[-1]["emojis"].append(emoji_id)`

			`# Add the emoji itself to the "emojis" dict`
			`emoji_picker_datasource_emojis[emoji_id] = {`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`"a": emoji_name_cap,`
Add script to pull emojis from Unicode.org as a file 2021-03-16 18:23:06 +03:00			`"b": emoji_code,`
			`"j": emoji_keywords`
			`}`

Update import_emojis.py to pull keywords from emojilib, update quick reactions 2021-03-20 00:24:43 +03:00			`# The keywords of unicode.org are usually quite sparse.`
			`# There is no official specification of keywords beyond that, but muan/emojilib maintains a well maintained and`
			`# established repository with additional keywords. We extend our list with the keywords from there.`
			`# At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1.`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("Fetching additional keywords from Emojilib...")`
Update import_emojis.py to pull keywords from emojilib, update quick reactions 2021-03-20 00:24:43 +03:00			`req = requests.get("https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json")`
			`emojilib_data = json.loads(req.content)`

			`# We just go over all the official emojis from unicode, and add the keywords there`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("Adding keywords to emojis...")`
Update import_emojis.py to pull keywords from emojilib, update quick reactions 2021-03-20 00:24:43 +03:00			`for emoji in emoji_picker_datasource_emojis:`
			`emoji_name = emoji_picker_datasource_emojis[emoji]["a"]`
			`emoji_code = emoji_picker_datasource_emojis[emoji]["b"]`

			`# Convert back to actual unicode emoji`
			`emoji_unicode = ''.join(map(lambda s: chr(int(s, 16)), emoji_code.split("-")))`

			`# Search for emoji in emojilib`
			`if emoji_unicode in emojilib_data:`
			`emoji_additional_keywords = emojilib_data[emoji_unicode]`
			`elif emoji_unicode+chr(0xfe0f) in emojilib_data:`
			`emoji_additional_keywords = emojilib_data[emoji_unicode+chr(0xfe0f)]`
			`else:`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("* No additional keywords for", emoji_unicode, emoji_picker_datasource_emojis[emoji])`
Update import_emojis.py to pull keywords from emojilib, update quick reactions 2021-03-20 00:24:43 +03:00			`continue`

			`# If additional keywords exist, add them to emoji_picker_datasource_emojis`
			`# Avoid duplicates and keep order. Put official unicode.com keywords first and extend up with emojilib ones.`
			`new_keywords = OrderedDict.fromkeys(emoji_picker_datasource_emojis[emoji]["j"] + emoji_additional_keywords).keys()`
			`# Remove the ones derived from the unicode name`
			`new_keywords = new_keywords - {emoji.replace("-", "_")} - {emoji.replace("-", " ")} - {emoji_name}`
			`# Write new keywords back`
			`emoji_picker_datasource_emojis[emoji]["j"] = list(new_keywords)`

			`# Filter out components from unicode 13.1 (as they are not suitable for single-emoji reactions)`
			`emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']`

			`# Write result to file (overwrite previous), without escaping unicode characters`
Update emoji import script to ease execution, output smaller .json files and capitalize emoji names 2021-03-26 19:21:02 +03:00			`print("Writing emoji_picker_datasource.json...")`
			`scripts_dir = os.path.dirname(os.path.abspath(__file__))`
			`with open(os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json"), "w") as outfile:`
			`json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, separators=(',', ':'))`
			`print("Done.")`