Skip to content

Commit

Permalink
ENH Jyutping-to-IPA conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
jacksonllee committed May 23, 2024
1 parent 7d10d44 commit 2b10ad0
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
## [Unreleased]

### Added
- Added the new function `jyutping_to_ipa` for Jyutping-to-IPA conversion.
- The `characters_to_jyutping` function can now take a list of strings as input
with user-provided word segmentation.
- Added support for Python 3.11 and 3.12.
Expand Down
1 change: 1 addition & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Jyutping Romanization

characters_to_jyutping
parse_jyutping
jyutping_to_ipa
jyutping_to_yale
jyutping_to_tipa

Expand Down
2 changes: 1 addition & 1 deletion docs/source/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ available from CHILDES and TalkBank (in alphabetical order):
>>> corpus.n_files()
161
>>> len(corpus.words())
1177307
1177971
* `Leo Corpus <https://childes.talkbank.org/access/Biling/Leo.html>`_

Expand Down
34 changes: 33 additions & 1 deletion docs/source/jyutping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ involve the processing of `Jyutping romanization
<https://www.lshk.org/jyutping>`_.
A common need is to convert Cantonese characters to Jyutping romanization.
Another functionality of interest is the ability to convert Jyutping into
other romanization schemes still used today.
IPA or another romanization scheme.
Whether you have :ref:`data in Jyutping from a corpus reader<jyutping_from_reader>`
or you have independently ingested Jyutping as Python strings,
PyCantonese provides tools for these use cases.
Expand Down Expand Up @@ -147,6 +147,38 @@ and final (= nucleus + coda; 韻母):
'yut'
Jyutping-to-IPA Conversion
--------------------------

:func:`~pycantonese.jyutping_to_ipa` converts Jyutping into IPA
(International Phonetic Alphabet), the standard representation of speech sounds
in phonetics and phonology:

.. code-block:: python
>>> import pycantonese
>>> pycantonese.jyutping_to_ipa('gwong2dung1waa2') # 廣東話 Cantonese
['kʷɔŋ25', 'tʊŋ55', 'waː25']
>>> pycantonese.jyutping_to_ipa('gwong2dung1waa2', as_list=False)
'kʷɔŋ25 tʊŋ55 waː25'
The mapping from Jyutping to IPA symbols is based on Matthews and Yip (2011: 461-463).
If you'd like to customize the mapping of specific symbols,
:func:`~pycantonese.jyutping_to_ipa` accepts keyword arguments
``onsets``, ``nuclei``, ``codas``, and ``tones``, each of which
takes a dictionary that maps a Jyutping sound to your desired symbol:

.. code-block:: python
>>> import pycantonese
>>> pycantonese.jyutping_to_ipa('ci1')
['tsʰi55']
>>> pycantonese.jyutping_to_ipa('ci1', onsets={'c': "tʃ'"})
["tʃ'i55"]
>>> pycantonese.jyutping_to_ipa('ci1', tones={'1': "˥"})
['tsʰi˥']
Jyutping-to-Yale Conversion
---------------------------

Expand Down
8 changes: 3 additions & 5 deletions src/pycantonese/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
try:
from importlib.metadata import version
except ModuleNotFoundError:
# For Python < 3.8
from importlib_metadata import version
from importlib.metadata import version

from pycantonese.corpus import hkcancor, read_chat, CHATReader
from pycantonese.jyutping.characters import (
characters_to_jyutping,
characters2jyutping,
)
from pycantonese.jyutping.parse_jyutping import parse_jyutping
from pycantonese.jyutping.ipa import jyutping_to_ipa
from pycantonese.jyutping.tipa import jyutping_to_tipa, jyutping2tipa
from pycantonese.jyutping.yale import jyutping_to_yale, jyutping2yale
from pycantonese.pos_tagging.tagger import pos_tag
Expand All @@ -27,6 +24,7 @@
"characters_to_jyutping",
"characters2jyutping",
"hkcancor",
"jyutping_to_ipa",
"jyutping_to_tipa",
"jyutping_to_yale",
"jyutping2tipa",
Expand Down
164 changes: 164 additions & 0 deletions src/pycantonese/jyutping/ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
from functools import lru_cache
from typing import Dict, List, Optional, Union

from .parse_jyutping import parse_jyutping


# Default Jyutping-to-IPA lookup tables used by ``jyutping_to_ipa``.
# Each table is keyed by the Jyutping spelling of one syllable component.
# The inline notes on some entries name context-dependent variants; those
# variants are not stored in the tables but applied in ``jyutping_to_ipa``.

# Onsets. The empty-string key maps to an empty IPA string
# (presumably for syllables parsed without an onset).
_ONSETS = {
    "b": "p",
    "d": "t",
    "g": "k",
    "gw": "kʷ",
    "z": "ts",
    "p": "pʰ",
    "t": "tʰ",
    "k": "kʰ",
    "kw": "kʷʰ",
    "c": "tsʰ",
    "m": "m",
    "n": "n",
    "ng": "ŋ",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "j",
    "": "",
}

# Nuclei. "m", "n" and "ng" are listed here as well as in the onset and
# coda tables, so they can be looked up as a nucleus.
_NUCLEI = {
    "aa": "aː",
    "a": "ɐ",
    "i": "i",  # ɪ before ng, k
    "yu": "y",
    "u": "u",  # ʊ before ng, k
    "oe": "œ",
    "e": "ɛ",  # e before i
    "eo": "ɵ",
    "o": "ɔ",  # o before u
    "m": "m",
    "n": "n",
    "ng": "ŋ",
}

# Codas. The empty-string key maps to an empty IPA string
# (presumably for syllables parsed without a coda).
_CODAS = {
    "p": "p̚",
    "t": "t̚",
    "k": "k̚",
    "m": "m",
    "n": "n",
    "ng": "ŋ",
    "i": "i",  # y after eo, u, o
    "u": "u",
    "": "",
}

# Tones: Jyutping tone number (as a string) -> two-digit tone string
# appended to the end of each IPA syllable.
_TONES = {
    "1": "55",
    "2": "25",
    "3": "33",
    "4": "21",
    "5": "23",
    "6": "22",
}


@lru_cache
def _replace(current, parsed, part_to_match, matches, default):
    """Pick a context-dependent symbol for one syllable component.

    Looks up the attribute named ``part_to_match`` on ``parsed`` and returns
    ``default`` when that attribute's value is one of ``matches``; otherwise
    ``current`` is returned unchanged.

    Because the function is wrapped in ``lru_cache``, every argument
    (including ``parsed`` and ``matches``) must be hashable.
    """
    context_value = getattr(parsed, part_to_match)
    return default if context_value in matches else current


def jyutping_to_ipa(
    jp_str: str,
    as_list: bool = True,
    *,
    onsets: Optional[Dict[str, str]] = None,
    nuclei: Optional[Dict[str, str]] = None,
    codas: Optional[Dict[str, str]] = None,
    tones: Optional[Dict[str, str]] = None,
) -> Union[List[str], str]:
    """Convert Jyutping romanization into IPA.

    The Jyutping-to-IPA mapping is based on Matthews and Yip (2011: 461-463).

    Each syllable is converted component by component (onset, nucleus,
    coda, tone). A few symbols depend on their neighbor within the syllable:
    the nucleus "i" becomes /ɪ/ and "u" becomes /ʊ/ before the codas "ng"
    and "k"; the nucleus "e" becomes /e/ before the coda "i"; the nucleus
    "o" becomes /o/ before the coda "u"; and the coda "i" becomes /y/ after
    the nuclei "eo", "u", and "o".

    Custom mappings are keyed by the Jyutping symbol and are applied last,
    so a custom entry replaces both the default symbol and any
    context-dependent variant of it.

    Parameters
    ----------
    jp_str : str
        Jyutping romanization for one or multiple characters
    as_list : bool, optional
        If ``True`` (the default), the returned value is a list of strings
        where each string is the IPA representation of each Cantonese / Chinese
        character based on the input Jyutping. If ``False``, the returned
        value is a single string in which the IPA syllables are joined by
        a space.
    onsets : dict[str, str], optional
        If provided, it must be a dictionary that maps Jyutping onsets to
        the desired IPA symbols for customization. For example, Jyutping "z"
        maps to IPA /ts/ by default. Passing in ``{"z": "tʃ"}`` would map
        "z" to /tʃ/ instead.
    nuclei : dict[str, str], optional
        If provided, it must be a dictionary that maps Jyutping nuclei to
        the desired IPA symbols for customization. For example, Jyutping "i"
        maps to IPA /i/ by default. Passing in ``{"i": "iː"}`` would map
        "i" to /iː/ instead.
    codas : dict[str, str], optional
        If provided, it must be a dictionary that maps Jyutping codas to
        the desired IPA symbols for customization. For example, Jyutping "p"
        maps to IPA /p̚/ by default. Passing in ``{"p": "p"}`` would map
        "p" to /p/ instead.
    tones : dict[str, str], optional
        If provided, it must be a dictionary that maps Jyutping tones to
        the desired IPA symbols for customization. For example, Jyutping "2"
        (high-rising tone) maps to IPA /25/ by default.
        Passing in ``{"2": "35"}`` would map Jyutping "2" to /35/ instead.

    Returns
    -------
    list[str] | str

    Examples
    --------
    >>> jyutping_to_ipa('gwong2dung1waa2')  # 廣東話 Cantonese
    ['kʷɔŋ25', 'tʊŋ55', 'waː25']
    >>> jyutping_to_ipa('gwong2dung1waa2', as_list=False)
    'kʷɔŋ25 tʊŋ55 waː25'
    >>> jyutping_to_ipa('ci1', onsets={'c': "tʃ'"})
    ["tʃ'i55"]
    >>> jyutping_to_ipa('ci1', tones={'1': "˥"})
    ['tsʰi˥']
    """
    # Context-dependent nuclei:
    # Jyutping nucleus -> (codas that trigger the variant, variant symbol).
    nucleus_variants = {
        "i": (("ng", "k"), "ɪ"),
        "u": (("ng", "k"), "ʊ"),
        "e": (("i",), "e"),
        "o": (("u",), "o"),
    }

    # A missing customization table behaves like an empty one.
    onset_overrides = onsets or {}
    nucleus_overrides = nuclei or {}
    coda_overrides = codas or {}
    tone_overrides = tones or {}

    ipa_list = []

    for syllable in parse_jyutping(jp_str):
        # Start from the default symbol of every component.
        onset = _ONSETS[syllable.onset]
        nucleus = _NUCLEI[syllable.nucleus]
        coda = _CODAS[syllable.coda]
        tone = _TONES[syllable.tone]

        # Swap in the nucleus variant when the coda calls for it.
        variant = nucleus_variants.get(syllable.nucleus)
        if variant is not None:
            trigger_codas, variant_symbol = variant
            nucleus = _replace(
                nucleus, syllable, "coda", trigger_codas, variant_symbol
            )

        # The coda "i" is rounded to "y" after these nuclei.
        if syllable.coda == "i":
            coda = _replace(coda, syllable, "nucleus", ("eo", "u", "o"), "y")

        # User-supplied symbols win over everything computed above.
        onset = onset_overrides.get(syllable.onset, onset)
        nucleus = nucleus_overrides.get(syllable.nucleus, nucleus)
        coda = coda_overrides.get(syllable.coda, coda)
        tone = tone_overrides.get(syllable.tone, tone)

        ipa_list.append("".join((onset, nucleus, coda, tone)))

    return ipa_list if as_list else " ".join(ipa_list)
3 changes: 3 additions & 0 deletions src/pycantonese/jyutping/parse_jyutping.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ def __str__(self):
"""Combine onset + nucleus + coda + tone."""
return f"{self.onset}{self.nucleus}{self.coda}{self.tone}"

def __hash__(self):
    """Hash the instance by its string form (onset + nucleus + coda + tone)."""
    return hash(str(self))

@property
def final(self):
"""Return the final (= nucleus + coda)."""
Expand Down
2 changes: 2 additions & 0 deletions src/pycantonese/jyutping/yale.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unicodedata
from functools import lru_cache

from pycantonese.jyutping.parse_jyutping import parse_jyutping
from pycantonese.util import _deprecate
Expand Down Expand Up @@ -54,6 +55,7 @@
}


@lru_cache
def jyutping_to_yale(jp_str, as_list=True):
"""Convert Jyutping romanization into Yale romanization.
Expand Down
43 changes: 43 additions & 0 deletions tests/test_jyutping/test_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest

from pycantonese.jyutping.ipa import jyutping_to_ipa


@pytest.mark.parametrize(
    "jp_str, expected",
    [
        ("taa1", "tʰaː55"),
        ("zi1", "tsi55"),
        ("ging6", "kɪŋ22"),  # i -> ɪ before ng
        ("wu4", "wu21"),
        ("puk1", "pʰʊk̚55"),  # u -> ʊ before k
        ("je5", "jɛ23"),
        ("sei3", "sei33"),  # e -> e before i
        ("ngo5", "ŋɔ23"),
        ("mou2", "mou25"),  # o -> o before u
        ("gui6", "kuy22"),  # coda i -> y after u
        ("baau3", "paːu33"),
        ("ngau4", "ŋɐu21"),
        ("syu1", "sy55"),
        ("goeng1", "kœŋ55"),
        ("geok3", "kɵk̚33"),
    ],
)
def test_jyutping_to_ipa__base_cases(jp_str, expected):
    """Single syllables convert to the expected IPA with default mappings."""
    actual = jyutping_to_ipa(jp_str, as_list=False)
    assert actual == expected


def test_jyutping_to_ipa__custom_onsets():
    """A custom onset mapping replaces the default onset symbol."""
    actual = jyutping_to_ipa("ci1", as_list=False, onsets={"c": "tʃ'"})
    assert actual == "tʃ'i55"


def test_jyutping_to_ipa__custom_nuclei():
    """A custom nucleus mapping replaces the default nucleus symbol."""
    actual = jyutping_to_ipa("ci1", as_list=False, nuclei={"i": "iː"})
    assert actual == "tsʰiː55"


def test_jyutping_to_ipa__custom_tones():
    """A custom tone mapping replaces the default tone string."""
    actual = jyutping_to_ipa("ci2", as_list=False, tones={"2": "35"})
    assert actual == "tsʰi35"


def test_jyutping_to_ipa__custom_codas():
    """A custom coda mapping replaces the default coda symbol."""
    actual = jyutping_to_ipa("sip3", as_list=False, codas={"p": "p"})
    assert actual == "sip33"

0 comments on commit 2b10ad0

Please sign in to comment.