Skip to content

Update emoji data and API #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ exmoji-*.tar

# Temporary files, for example, from tests.
/tmp/

# Benchmark snapshots
/bench/snapshots
31 changes: 15 additions & 16 deletions bench/exmoji_bench.exs
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
defmodule ExmojiBench do
use Benchfella

bench "all", do: Exmoji.all
bench "all_doublebyte", do: Exmoji.all_doublebyte
bench "all_with_variants", do: Exmoji.all_with_variants
bench "from_unified", do: Exmoji.from_unified("1F680")
bench "chars", do: Exmoji.chars
bench "codepoints", do: Exmoji.codepoints
bench("all", do: Exmoji.all())
bench("all_doublebyte", do: Exmoji.all_doublebyte())
bench("all_with_variants", do: Exmoji.all_with_variants())
bench("from_unified", do: Exmoji.from_unified("1F680"))
bench("chars", do: Exmoji.chars())
bench("codepoints", do: Exmoji.codepoints())

bench "find_by_name - many", do: Exmoji.find_by_name("tree")
bench "find_by_name - none", do: Exmoji.find_by_name("zzzz")
bench("find_by_name - many", do: Exmoji.find_by_name("tree"))
bench("find_by_name - none", do: Exmoji.find_by_name("zzzz"))

bench "find_by_short_name - many", do: Exmoji.find_by_short_name("MOON")
bench "find_by_short_name - none", do: Exmoji.find_by_short_name("zzzz")
bench("find_by_short_name - many", do: Exmoji.find_by_short_name("MOON"))
bench("find_by_short_name - none", do: Exmoji.find_by_short_name("zzzz"))

bench "char_to_unified - single", do: Exmoji.char_to_unified("🚀")
bench "char_to_unified - double", do: Exmoji.char_to_unified("\x{2601}\x{FE0F}")

bench "unified_to_char - single", do: Exmoji.unified_to_char("1F47E")
bench "unified_to_char - double", do: Exmoji.unified_to_char("2764-fe0f")
bench "unified_to_char - triple", do: Exmoji.unified_to_char("0030-FE0F-20E3")
bench("char_to_unified - single", do: Exmoji.Util.char_to_unified("🚀"))
# Cloud (U+2601) followed by variation selector-16 (U+FE0F). Codepoints above
# 0xFF need `\u{...}` escapes: `\x` only consumes two hex digits, so a literal
# like "\x2601" is the byte 0x26 followed by the text "01", not U+2601.
bench("char_to_unified - double", do: Exmoji.Util.char_to_unified("\u{2601}\u{FE0F}"))

bench("unified_to_char - single", do: Exmoji.Util.unified_to_char("1F47E"))
bench("unified_to_char - double", do: Exmoji.Util.unified_to_char("2764-fe0f"))
bench("unified_to_char - triple", do: Exmoji.Util.unified_to_char("0030-FE0F-20E3"))
end
47 changes: 1 addition & 46 deletions lib/exmoji.ex
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ defmodule Exmoji do
"""
def find_by_name(name) do
name = String.upcase(name)
Enum.filter(@emoji_chars, &String.contains?(&1.name, name))
Enum.filter(@emoji_chars, &(is_binary(&1.name) && String.contains?(&1.name, name)))
end

@doc """
Expand Down Expand Up @@ -153,49 +153,4 @@ defmodule Exmoji do
end

defp _from_unified(_), do: nil

@doc """
Convert a unified ID directly to its bitstring glyph representation.

## Example

iex> Exmoji.unified_to_char("1F47E")
"👾"

"""
def unified_to_char(uid) do
  # Each dash-separated segment of the ID is one codepoint written in hex.
  codepoints = for hex <- String.split(uid, "-"), do: String.to_integer(hex, 16)

  # A list of integer codepoints is encoded into a single UTF-8 binary.
  List.to_string(codepoints)
end

@doc """
Convert a native bitstring glyph to its unified codepoint ID.

This is a conversion operation, not a match, so it may produce unexpected
results with different types of values.

## Examples

iex> Exmoji.char_to_unified("👾")
"1F47E"

iex> Exmoji.char_to_unified("\x23\u{fe0f}\u{20e3}")
"0023-FE0F-20E3"

"""
def char_to_unified(char) do
  # One zero-padded hex ID per codepoint in the input string.
  hex_ids = for codepoint <- String.codepoints(char), do: padded_hex_string(codepoint)

  String.upcase(Enum.join(hex_ids, "-"))
end

# Render the integer value of a single codepoint as a hexadecimal string,
# left-padded with zeros to a minimum width of 4 digits.
defp padded_hex_string(<<cp_int_value::utf8>>) do
  hex_digits = Integer.to_string(cp_int_value, 16)
  String.pad_leading(hex_digits, 4, "0")
end
end
8 changes: 4 additions & 4 deletions lib/exmoji/emoji_char.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ defmodule Exmoji.EmojiChar do
def render(ec, options \\ [variant_encoding: true])

def render(ec, variant_encoding: false) do
Exmoji.unified_to_char(ec.unified)
Exmoji.Util.unified_to_char(ec.unified)
end

def render(ec, variant_encoding: true) do
case variant?(ec) do
true -> Exmoji.unified_to_char(variant(ec))
false -> Exmoji.unified_to_char(ec.unified)
true -> Exmoji.Util.unified_to_char(variant(ec))
false -> Exmoji.Util.unified_to_char(ec.unified)
end
end

Expand All @@ -66,7 +66,7 @@ defmodule Exmoji.EmojiChar do
"""
def chars(%EmojiChar{} = emojichar) do
codepoint_ids(emojichar)
|> Enum.map(&Exmoji.unified_to_char/1)
|> Enum.map(&Exmoji.Util.unified_to_char/1)
end

@doc """
Expand Down
9 changes: 7 additions & 2 deletions lib/exmoji/scanner.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ defmodule Exmoji.Scanner do
"""
def scan(str) do
bscan(str)
|> Enum.map(&Exmoji.char_to_unified/1)
|> Enum.map(&Exmoji.Util.char_to_unified/1)
|> Enum.map(&Exmoji.from_unified/1)
end

Expand All @@ -34,8 +34,13 @@ defmodule Exmoji.Scanner do
# new algorithm produces identical results.
#
# Thus it is kept as public so we can compare it in test...
fbs_pattern = Exmoji.chars(include_variants: true) |> Enum.join("|")
fbs_pattern =
Exmoji.chars(include_variants: true)
|> Enum.map(fn emoji_char -> Regex.escape(emoji_char) end)
|> Enum.join("|")

@fbs_regexp Regex.compile!("(?:#{fbs_pattern})")

@doc false
def rscan(str) do
Regex.scan(@fbs_regexp, str)
Expand Down
87 changes: 87 additions & 0 deletions lib/exmoji/util.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
defmodule Exmoji.Util.Unified do
  @moduledoc false

  # Algorithmic unified ID -> glyph conversion. `Exmoji.Util` calls this while
  # compiling to build its precompiled clauses, and again at runtime as the
  # fallback for IDs that have no precompiled clause.
  def _unified_to_char(uid) do
    # Each dash-separated segment of the ID is one codepoint written in hex.
    codepoints = for hex <- String.split(uid, "-"), do: String.to_integer(hex, 16)

    List.to_string(codepoints)
  end
end

defmodule Exmoji.Util.Char do
  @moduledoc false

  # Algorithmic glyph -> unified ID conversion. `Exmoji.Util` falls back to
  # this for any input that has no precompiled clause.
  def _char_to_unified(char) do
    # One zero-padded hex ID per codepoint in the input string.
    hex_ids = for codepoint <- String.codepoints(char), do: padded_hex_string(codepoint)

    String.upcase(Enum.join(hex_ids, "-"))
  end

  # Render the integer value of a single codepoint as a hexadecimal string,
  # left-padded with zeros to a minimum width of 4 digits.
  defp padded_hex_string(<<cp_int_value::utf8>>) do
    hex_digits = Integer.to_string(cp_int_value, 16)
    String.pad_leading(hex_digits, 4, "0")
  end
end

defmodule Exmoji.Util do
@moduledoc """
Provides utility functions to convert between Unicode unified ID values and
rendered Emoji glyphs in bitstring format.

Pattern matched with precompiled values for all known Emoji character values
for maximum speed, with fallbacks to algorithmic conversion.
"""

alias Exmoji.EmojiChar
alias Exmoji.Util.Unified
alias Exmoji.Util.Char

@doc """
Convert a unified ID directly to its bitstring glyph representation.

Precompiled only for uppercase format of the hex ID.

## Example

iex> Exmoji.Util.unified_to_char("1F47E")
"👾"

"""
# Compile-time generation: one `unified_to_char/1` clause is defined for every
# ID returned by `EmojiChar.codepoint_ids/1` for every emoji in `Exmoji.all()`.
# The glyph is computed here, while the module compiles, and injected into the
# clause body via `unquote`, so a matching call returns a precomputed value.
# Clause heads are the exact ID strings, so an ID spelled in a different letter
# case than the generated one does not match and takes the fallback below.
for ec <- Exmoji.all(), cp <- EmojiChar.codepoint_ids(ec) do
def unified_to_char(unquote(cp)) do
unquote(Unified._unified_to_char(cp))
end
end

# if not found, fallback
# This clause has to stay after the generated ones: clauses are tried in
# definition order and this head matches any argument.
def unified_to_char(uid), do: Unified._unified_to_char(uid)

@doc """
Convert a native bitstring glyph to its unified codepoint ID.

## Examples

iex> Exmoji.Util.char_to_unified("👾")
"1F47E"

iex> Exmoji.Util.char_to_unified("\u0023\uFE0F\u20E3")
"0023-FE0F-20E3"

"""
# Inverse table of the one above: the clause head is the glyph (rendered at
# compile time with `Unified._unified_to_char/1`) and the body is the ID it was
# rendered from.
for ec <- Exmoji.all(), cp <- EmojiChar.codepoint_ids(ec) do
def char_to_unified(unquote(Unified._unified_to_char(cp))) do
unquote(cp)
end
end

# if not found, fallback
# Must remain the last clause, as it matches any argument. Note that the
# parameter is named `uid` but the value passed in is the glyph string.
def char_to_unified(uid), do: Char._char_to_unified(uid)
end
2 changes: 1 addition & 1 deletion lib/vendor/emoji-data/emoji.json

Large diffs are not rendered by default.

36 changes: 18 additions & 18 deletions test/exmoji_test.exs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
defmodule ExmojiTest do
use ExUnit.Case, async: true
doctest Exmoji
doctest Exmoji.Util

# Define a number of known Emoji library characteristics.
# We should expect to get this many from our data file.
# This may be manually updated in the future as Emoji evolves.
@known_chars 845
@known_doublebyte 21
@known_variants 107

@known_chars 1300
@known_doublebyte 287
@known_variants 134
#
# #all
#
Expand Down Expand Up @@ -53,7 +53,7 @@ defmodule ExmojiTest do
assert Enum.count(results) == @known_chars

for r <- results do
assert String.match?(r, ~r/^[0-9A-F\-]{4,11}$/)
assert String.match?(r, ~r/^[0-9A-F\-]{4,42}$/)
end
end

Expand All @@ -62,7 +62,7 @@ defmodule ExmojiTest do
assert Enum.count(results) == @known_chars + @known_variants

for r <- results do
assert String.match?(r, ~r/^[0-9A-F\-]{4,16}$/)
assert String.match?(r, ~r/^[0-9A-F\-]{4,42}$/)
end
end

Expand Down Expand Up @@ -130,47 +130,47 @@ defmodule ExmojiTest do
end

test ".from_short_name - returns nil if nothing matches" do
assert Exmoji.from_short_name("taco") == nil
assert Exmoji.from_short_name("nacho") == nil
end

#
# #char_to_unified
#
test ".char_to_unified - converts normal emoji to unified codepoint" do
assert Exmoji.char_to_unified("👾") == "1F47E"
assert Exmoji.char_to_unified("🚀") == "1F680"
assert Exmoji.Util.char_to_unified("👾") == "1F47E"
assert Exmoji.Util.char_to_unified("🚀") == "1F680"
end

test ".char_to_unified - converts double-byte emoji to proper codepoint" do
assert Exmoji.char_to_unified("🇺🇸") == "1F1FA-1F1F8"
assert Exmoji.Util.char_to_unified("🇺🇸") == "1F1FA-1F1F8"
end

test ".char_to_unified - in doublebyte, adds padding to hex codes that are <4 chars" do
assert Exmoji.char_to_unified("#⃣") == "0023-20E3"
assert Exmoji.Util.char_to_unified("#⃣") == "0023-20E3"
end

test ".char_to_unified - converts variant encoded emoji to variant unified codepoint" do
assert Exmoji.char_to_unified("\u{2601}\u{FE0F}") == "2601-FE0F"
assert Exmoji.Util.char_to_unified("\u{2601}\u{FE0F}") == "2601-FE0F"
end

#
# #unified_to_char
#
test ".unified_to_char - converts normal unified codepoints to unicode strings" do
assert Exmoji.unified_to_char("1F47E") == "👾"
assert Exmoji.unified_to_char("1F680") == "🚀"
assert Exmoji.Util.unified_to_char("1F47E") == "👾"
assert Exmoji.Util.unified_to_char("1F680") == "🚀"
end

test ".unified_to_char - converts doublebyte unified codepoints to unicode strings" do
assert Exmoji.unified_to_char("1F1FA-1F1F8") == "🇺🇸"
assert Exmoji.unified_to_char("0023-20E3") == "#⃣"
assert Exmoji.Util.unified_to_char("1F1FA-1F1F8") == "🇺🇸"
assert Exmoji.Util.unified_to_char("0023-20E3") == "#⃣"
end

test ".unified_to_char - converts variant unified codepoints to unicode strings" do
assert Exmoji.unified_to_char("2764-fe0f") == "\u{2764}\u{FE0F}"
assert Exmoji.Util.unified_to_char("2764-fe0f") == "\u{2764}\u{FE0F}"
end

test ".unified_to_char - converts variant+doublebyte chars (triplets!) to unicode strings" do
assert Exmoji.unified_to_char("0030-FE0F-20E3") == "\u{0030}\u{FE0F}\u{20E3}"
assert Exmoji.Util.unified_to_char("0030-FE0F-20E3") == "\u{0030}\u{FE0F}\u{20E3}"
end
end