#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from copy import deepcopy
from itertools import chain
import os
import re
import warnings
from .compat import text_type, callable_check
from .constants import (
PHRASES_DICT, PINYIN_DICT, _INITIALS, PHONETIC_SYMBOL, RE_PHONETIC_SYMBOL,
RE_TONE2, RE_TONE3, RE_HANS, U_FINALS_EXCEPTIONS_MAP,
BOPOMOFO_REPLACE, BOPOMOFO_TABLE,
CYRILLIC_REPLACE, CYRILLIC_TABLE,
NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER,
FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST,
CYRILLIC, CYRILLIC_FIRST
)
from .utils import simple_seg, _replace_tone2_style_dict_to_default
def seg(hans):
if getattr(seg, 'no_jieba', None):
ret = hans
return simple_seg(ret)
if seg.jieba is None:
try:
import jieba
seg.jieba = jieba
except ImportError:
seg.no_jieba = True
return seg(hans)
else:
hans = simple_seg(hans)
ret = []
for x in hans:
if not RE_HANS.match(x): # 没有拼音的字符,不再参与二次分词
ret.append(x)
else:
ret.extend(list(seg.jieba.cut(x)))
return ret
seg.jieba = None
if os.environ.get('PYPINYIN_NO_JIEBA'):
seg.no_jieba = True
[文档]def load_single_dict(pinyin_dict, style='default'):
"""载入用户自定义的单字拼音库
:param pinyin_dict: 单字拼音库。比如: ``{0x963F: u"ā,ē"}``
:param style: pinyin_dict 参数值的拼音库风格. 支持 'default', 'tone2'
:type pinyin_dict: dict
"""
if style == 'tone2':
for k, v in pinyin_dict.items():
v = _replace_tone2_style_dict_to_default(v)
PINYIN_DICT[k] = v
else:
PINYIN_DICT.update(pinyin_dict)
[文档]def load_phrases_dict(phrases_dict, style='default'):
"""载入用户自定义的词语拼音库
:param phrases_dict: 词语拼音库。比如: ``{u"阿爸": [[u"ā"], [u"bà"]]}``
:param style: phrases_dict 参数值的拼音库风格. 支持 'default', 'tone2'
:type phrases_dict: dict
"""
if style == 'tone2':
for k, value in phrases_dict.items():
v = [
list(map(_replace_tone2_style_dict_to_default, pys))
for pys in value
]
PHRASES_DICT[k] = v
else:
PHRASES_DICT.update(phrases_dict)
def initial(pinyin):
"""获取单个拼音中的声母.
:param pinyin: 单个拼音
:type pinyin: unicode
:return: 声母
:rtype: unicode
"""
for i in _INITIALS:
if pinyin.startswith(i):
return i
return ''
def final(pinyin):
"""获取单个拼音中的韵母.
:param pinyin: 单个拼音
:type pinyin: unicode
:return: 韵母
:rtype: unicode
"""
initial_ = initial(pinyin) or None
if not initial_:
return no_initial_final(pinyin)
# 特例 j/q/x
m = re.match(r'^(j|q|x)(ū|ú|ǔ|ù)$', pinyin)
if m:
return (U_FINALS_EXCEPTIONS_MAP[m.group(2)])
pinyin = re.sub(r'^(j|q|x)u(\d?)$', r'\1v\2', pinyin)
return ''.join(pinyin.split(initial_, 1))
def no_initial_final(pinyin):
# 特例 y/w
if pinyin.startswith('y'):
if pinyin.startswith('yu'):
pinyin = 'v' + pinyin[2:]
elif pinyin.startswith('yi'):
pinyin = pinyin[1:]
else:
pinyin = 'i' + pinyin[1:]
elif pinyin.startswith('w'):
if pinyin.startswith('wu'):
pinyin = pinyin[1:]
else:
pinyin = 'u' + pinyin[1:]
return pinyin
def to_fixed(pinyin, style):
"""根据拼音风格格式化带声调的拼音.
:param pinyin: 单个拼音
:param style: 拼音风格
:return: 根据拼音风格格式化后的拼音字符串
:rtype: unicode
"""
# 声母
if style == INITIALS:
return initial(pinyin)
def _replace(m):
symbol = m.group(0) # 带声调的字符
# 不包含声调
if style in [NORMAL, FIRST_LETTER, FINALS]:
# 去掉声调: a1 -> a
# 鼻音: 'ḿ', 'ń', 'ň', 'ǹ '
if symbol in ['\u1e3f', '\u0144', '\u0148', '\u01f9']:
return re.sub(r'\d', r'', PHONETIC_SYMBOL[symbol])
else:
return re.sub(RE_TONE2, r'\1', PHONETIC_SYMBOL[symbol])
# 使用数字标识声调
elif style in [TONE2, TONE3, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC, CYRILLIC_FIRST]:
# 返回使用数字标识声调的字符
return PHONETIC_SYMBOL[symbol]
# 声调在头上
else:
return symbol
# 替换拼音中的带声调字符
py = re.sub(RE_PHONETIC_SYMBOL, _replace, pinyin)
# 将声调移动到最后
if style in [TONE3, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST,
CYRILLIC, CYRILLIC_FIRST]:
py = RE_TONE3.sub(r'\1\3\2', py)
# 首字母
if style == FIRST_LETTER:
py = py[0]
# 韵母
elif style in [FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3]:
# 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ'
if pinyin and pinyin[0] not in [
'\u1e3f', '\u0144', '\u0148', '\u01f9'
]:
py = final(py)
# 声调在拼音之后、注音
elif style in [BOPOMOFO, BOPOMOFO_FIRST]:
# 查表替换成注音
for f, r in BOPOMOFO_REPLACE:
py = f.sub(r, py)
py = ''.join(BOPOMOFO_TABLE.get(x, x) for x in py)
if style == BOPOMOFO_FIRST:
py = py[0]
elif style in [CYRILLIC, CYRILLIC_FIRST]:
# 汉语拼音与俄语字母对照表
for f, r in CYRILLIC_REPLACE:
py = f.sub(r, py)
py = ''.join(CYRILLIC_TABLE.get(x, x) for x in py)
if style == CYRILLIC_FIRST:
py = py[0]
return py
def toFixed(pinyin, style):
warnings.warn(
DeprecationWarning('"toFixed" is deprecated. Use "to_fixed" instead')
)
return to_fixed(pinyin, style)
def _handle_nopinyin_char(chars, errors='default'):
"""处理没有拼音的字符"""
if callable_check(errors):
return errors(chars)
if errors == 'default':
return chars
elif errors == 'ignore':
return None
elif errors == 'replace':
if len(chars) > 1:
return ''.join(text_type('%x' % ord(x)) for x in chars)
else:
return text_type('%x' % ord(chars))
def handle_nopinyin(chars, errors='default'):
py = _handle_nopinyin_char(chars, errors=errors)
if not py:
return []
if isinstance(py, list):
return py
else:
return [py]
def single_pinyin(han, style, heteronym, errors='default'):
"""单字拼音转换.
:param han: 单个汉字
:param errors: 指定如何处理没有拼音的字符,详情请参考
:py:func:`~pypinyin.pinyin`
:return: 返回拼音列表,多音字会有多个拼音项
:rtype: list
"""
num = ord(han)
# 处理没有拼音的字符
if num not in PINYIN_DICT:
return handle_nopinyin(han, errors=errors)
pys = PINYIN_DICT[num].split(',') # 字的拼音列表
if not heteronym:
return [to_fixed(pys[0], style)]
# 输出多音字的多个读音
# 临时存储已存在的拼音,避免多音字拼音转换为非音标风格出现重复。
py_cached = {}
pinyins = []
for i in pys:
py = to_fixed(i, style)
if py in py_cached:
continue
py_cached[py] = py
pinyins.append(py)
return pinyins
def phrase_pinyin(phrase, style, heteronym, errors='default'):
"""词语拼音转换.
:param phrase: 词语
:param errors: 指定如何处理没有拼音的字符
:return: 拼音列表
:rtype: list
"""
py = []
if phrase in PHRASES_DICT:
py = deepcopy(PHRASES_DICT[phrase])
for idx, item in enumerate(py):
py[idx] = [to_fixed(item[0], style=style)]
else:
for i in phrase:
single = single_pinyin(i, style=style, heteronym=heteronym,
errors=errors)
if single:
py.append(single)
return py
def phrases_pinyin(phrases, style, heteronym, errors='default'):
"""词语拼音转换.
:param phrases: 词语
:param errors: 指定如何处理没有拼音的字符
:return: 拼音列表
:rtype: list
"""
warnings.warn(
DeprecationWarning(
'"phrases_pinyin" is deprecated. Use "phrase_pinyin" instead'
)
)
return phrase_pinyin(phrases, style, heteronym, errors=errors)
def _pinyin(words, style, heteronym, errors):
pys = []
# 初步过滤没有拼音的字符
if RE_HANS.match(words):
pys = phrases_pinyin(words, style=style, heteronym=heteronym,
errors=errors)
return pys
for word in simple_seg(words):
if not (RE_HANS.match(word)):
py = handle_nopinyin(word, errors=errors)
pys.append(py) if py else None
else:
pys.extend(_pinyin(word, style, heteronym, errors))
return pys
[文档]def pinyin(hans, style=TONE, heteronym=False, errors='default'):
"""将汉字转换为拼音.
:param hans: 汉字字符串( ``'你好吗'`` )或列表( ``['你好', '吗']`` ).
如果用户安装了 ``jieba`` , 将使用 ``jieba`` 对字符串进行
分词处理。可以通过传入列表的方式禁用这种行为。
也可以使用自己喜爱的分词模块对字符串进行分词处理,
只需将经过分词处理的字符串列表传进来就可以了。
:type hans: unicode 字符串或字符串列表
:param style: 指定拼音风格
:param errors: 指定如何处理没有拼音的字符
* ``'default'``: 保留原始字符
* ``'ignore'``: 忽略该字符
* ``'replace'``: 替换为去掉 ``\\u`` 的 unicode 编码字符串
(``'\\u90aa'`` => ``'90aa'``)
* callable 对象: 回调函数之类的可调用对象。如果 ``erros``
参数 的值是个可调用对象,那么程序会回调这个函数:
``func(char)``::
def foobar(char):
return 'a'
pinyin('あ', errors=foobar)
:param heteronym: 是否启用多音字
:return: 拼音列表
:rtype: list
Usage::
>>> from pypinyin import pinyin
>>> import pypinyin
>>> pinyin('中心')
[['zhōng'], ['xīn']]
>>> pinyin('中心', heteronym=True) # 启用多音字模式
[['zhōng', 'zhòng'], ['xīn']]
>>> pinyin('中心', style=pypinyin.FIRST_LETTER) # 设置拼音风格
[['z'], ['x']]
>>> pinyin('中心', style=pypinyin.TONE2)
[['zho1ng'], ['xi1n']]
>>> pinyin('中心', style=pypinyin.CYRILLIC)
[['чжун1'], ['синь1']]
"""
# 对字符串进行分词处理
if isinstance(hans, text_type):
hans = seg(hans)
pys = []
for words in hans:
pys.extend(_pinyin(words, style, heteronym, errors))
return pys
[文档]def slug(hans, style=NORMAL, heteronym=False, separator='-', errors='default'):
"""生成 slug 字符串.
:param hans: 汉字
:type hans: unicode or list
:param style: 指定拼音风格
:param heteronym: 是否启用多音字
:param separstor: 两个拼音间的分隔符/连接符
:param errors: 指定如何处理没有拼音的字符,详情请参考
:py:func:`~pypinyin.pinyin`
:return: slug 字符串.
::
>>> import pypinyin
>>> pypinyin.slug('中国人')
'zhong-guo-ren'
>>> pypinyin.slug('中国人', separator=' ')
'zhong guo ren'
>>> pypinyin.slug('中国人', style=pypinyin.FIRST_LETTER)
'z-g-r'
>>> pypinyin.slug('中国人', style=pypinyin.CYRILLIC)
'чжун1-го2-жэнь2'
"""
return separator.join(chain(*pinyin(hans, style=style, heteronym=heteronym,
errors=errors)
))
[文档]def lazy_pinyin(hans, style=NORMAL, errors='default'):
"""不包含多音字的拼音列表.
与 :py:func:`~pypinyin.pinyin` 的区别是返回的拼音是个字符串,
并且每个字只包含一个读音.
:param hans: 汉字
:type hans: unicode or list
:param style: 指定拼音风格
:param errors: 指定如何处理没有拼音的字符,详情请参考
:py:func:`~pypinyin.pinyin`
:return: 拼音列表(e.g. ``['zhong', 'guo', 'ren']``)
:rtype: list
Usage::
>>> from pypinyin import lazy_pinyin
>>> import pypinyin
>>> lazy_pinyin('中心')
['zhong', 'xin']
>>> lazy_pinyin('中心', style=pypinyin.TONE)
['zhōng', 'xīn']
>>> lazy_pinyin('中心', style=pypinyin.FIRST_LETTER)
['z', 'x']
>>> lazy_pinyin('中心', style=pypinyin.TONE2)
['zho1ng', 'xi1n']
>>> lazy_pinyin('中心', style=pypinyin.CYRILLIC)
['чжун1', 'синь1']
"""
return list(chain(*pinyin(hans, style=style, heteronym=False,
errors=errors)))