Source code for qiuwenbot.filter.tw

# qiuwenbot, a bot to contribute to qiuwen.wiki
# Copyright (C) 2022  Jinzhe Zeng
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
import re

from zhconv import convert_for_mw

from qiuwenbot.utils import coutries

from .filter import Filter, TextReplaceFilter, register_filter


@register_filter
class TWLeaderFilter(TextReplaceFilter):
    """Filter to fix how leaders of the Taiwan area are named.

    The pattern matches one of the listed names (optionally wrapped in
    ``[[...]]``) that is immediately followed by 總統/总统 (optionally
    wrapped in a plain or piped ``[[...]]`` link). The replacement keeps
    only the name part, i.e. the trailing title is dropped.
    """

    def __init__(self):
        # Group 1: the name, with its optional wiki-link brackets.
        name_part = (
            r"((\[\[)?(蒋介石|蒋中正|严家淦|蒋经国|李登辉|陈水扁|蔡英文|蔣介石|蔣中正|嚴家淦|蔣經國|李登輝|陳水扁)(\]\])?)"
        )
        # The title that follows, either bare or inside a (piped) wiki link.
        title_part = r"((\[\[([^\|\[\]]+\|)?)?(總統|总统)(\]\])?)"
        # Only group 1 survives the substitution.
        keep_name_only = r"\1"
        super().__init__(name_part + title_part, keep_name_only)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语1"
@register_filter
class TWJPFilter(TextReplaceFilter):
    """Filter to fix the wording used for the Japanese authorities.

    Rewrites 日治时期/日治時期 as 日占时期/日占時期; the captured
    "period" suffix is kept in whichever script it was written.
    """

    def __init__(self):
        pattern = r"日治(时期|時期)"
        replacement = r"日占\1"
        super().__init__(pattern, replacement)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语2"
@register_filter
class TWQingFilter(TextReplaceFilter):
    """Filter to fix the wording used for the Qing authorities.

    Rewrites 清治时期/清治時期 as 清朝时期/清朝時期; the captured
    "period" suffix is kept in whichever script it was written.
    """

    def __init__(self):
        pattern = r"清治(时期|時期)"
        replacement = r"清朝\1"
        super().__init__(pattern, replacement)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语2-2"
@register_filter
class TWUnivFilter1(TextReplaceFilter):
    """Filter to fix the name of universities in the Taiwan area.

    Drops the 國立/国立 prefix from the listed institutions whose names
    already contain 臺灣/台湾 (or 金門/金门). The character in front of the
    prefix must not be one of the opening quotes “ ‘ 「 『, so a name that
    starts right after an opening quote is left untouched.
    """

    def __init__(self):
        super().__init__(
            # Group 1: the preceding (non-quote) character, kept as is.
            # Group 2: the 國立/国立 prefix, dropped by the replacement.
            # Group 3: the institution name, kept as is.
            # The discipline list carries both scripts for every entry;
            # 體育(運動)? is the traditional counterpart of 体育(运动)? and is
            # non-capturing so no group used by the replacement is renumbered.
            r"([^“‘「『])(國立|国立)((臺|台)(灣|湾)((师范|師範|海洋|藝術|艺术|体育(运动)?|體育(?:運動)?|科技)?大(學|学)|戲曲學院|戏曲学院)|金门大学|金門大學)",
            r"\1\3",
        )

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语3"
@register_filter
class TWUnivFilter2(TextReplaceFilter):
    """Filter to fix the name of universities in the Taiwan area.

    Replaces the 國立/国立 prefix of the listed institutions with 台湾.
    The character in front of the prefix must not be one of the opening
    quotes “ ‘ 「 『.
    """

    def __init__(self):
        # The list is deliberately limited to institutions created after 1949.
        pattern = r"([^“‘「『])(國立|国立)((高雄师范|高雄師範|彰化師範|彰化师范|台北艺术|臺北藝術|臺南|台南|體育|体育|阳明|陽明|阳明交通|陽明交通)大(学|學)|傳統藝術中心|传统艺术中心)"
        # Keep the preceding character and the name; swap the prefix.
        replacement = r"\1台湾\3"
        super().__init__(pattern, replacement)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语4"
@register_filter
class TWNameFilter1(TextReplaceFilter):
    """Filter to fix the name of the Taiwan area.

    Matches 中华民国/中華民國 followed by 台湾 in any 台/臺 + 湾/灣
    spelling, optionally separated by ``|`` or ``]][[``, and replaces the
    whole match with 中国台湾.
    """

    def __init__(self):
        pattern = r"((中华民国|中華民國)(\||\]\]\[\[)?(台|臺)(湾|灣)|(中华民国|中華民國)((台|臺)(湾|灣)))"
        replacement = r"中国台湾"
        super().__init__(pattern, replacement)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语5"
@register_filter
class TWNameFilter2(TextReplaceFilter):
    """Filter to fix the name of the Taiwan area.

    Matches 中华民国/中華民國 (bare or as a ``[[...]]`` link) directly
    followed by one of the listed city names, and replaces the prefix with
    中国台湾 while keeping the city name.
    """

    def __init__(self):
        pattern = r"(中华民国|中華民國|\[\[中华民国\]\]|\[\[中華民國\]\])(臺北|台北|新北|桃园|桃園|台中|臺中|台南|臺南|高雄)"
        # Group 2 is the city name.
        replacement = r"中国台湾\2"
        super().__init__(pattern, replacement)

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正涉台用语6"
@register_filter
class TWWithOthersFilter1(TextReplaceFilter):
    """Filter to fix the Taiwan name when it is with other countries.

    Handles the order "<country> <conjunction> Taiwan": a name from
    ``coutries`` (as listed, or converted to zh-hant), optionally closed by
    ``]]``, then a conjunction, then Taiwan written bare or as a ``[[...]]``
    link. The Taiwan part is replaced with 中国台湾; everything before it
    is kept.
    """

    def __init__(self):
        countries_hant = [convert_for_mw(xx, "zh-hant") for xx in coutries]
        countries_alt = "|".join(coutries + countries_hant)
        # Any 台/臺 + 湾/灣 spelling, including the fully traditional 臺灣.
        taiwan = r"[台臺][湾灣]"
        pattern = (
            # Group 1: country, optional "]]" and the conjunction. Both the
            # full-width and the ASCII comma are accepted as separators.
            r"((%s)(\]\])?(和|与|、|,|,|或|或者|及|以及))" % countries_alt
            # Group 5: Taiwan, bare or wrapped in a wiki link.
            + r"(%s|\[\[%s\]\])" % (taiwan, taiwan)
        )
        super().__init__(pattern, r"\1中国台湾")

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正中国台湾与国家并列时的称呼1"
@register_filter
class TWWithOthersFilter2(TextReplaceFilter):
    """Filter to fix the Taiwan name when it is with other countries.

    Handles the order "Taiwan <conjunction> <country>": Taiwan written bare
    or as a ``[[...]]`` link, then a conjunction, an optional ``[[``, and a
    name from ``coutries`` (as listed, or converted to zh-hant). The Taiwan
    part is replaced with 中国台湾; the surrounding text is kept.

    The character in front of Taiwan must exist and must not be 国/國, so
    text that already reads 中国台湾 is not rewritten again.
    """

    def __init__(self):
        countries_hant = [convert_for_mw(xx, "zh-hant") for xx in coutries]
        countries_alt = "|".join(coutries + countries_hant)
        # Any 台/臺 + 湾/灣 spelling, including the fully traditional 臺灣.
        taiwan = r"[台臺][湾灣]"
        pattern = (
            # Group 1: preceding character; group 2: Taiwan (bare or linked).
            r"([^国國])(%s|\[\[%s\]\])" % (taiwan, taiwan)
            # Group 3: conjunction, optional "[[" and the country. Both the
            # full-width and the ASCII comma are accepted as separators.
            + r"((和|与|、|,|,|或|或者|及|以及)(\[\[)?(%s))" % countries_alt
        )
        super().__init__(pattern, r"\1中国台湾\3")

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正中国台湾与国家并列时的称呼2"
@register_filter
class TWWithOthersInTitleFilter(Filter):
    """Filter to fix the Taiwan name in title when it is with other countries.

    A section heading that consists only of Taiwan (or Hong Kong / Macau) is
    prefixed with 中国 when some heading of the same level consists only of
    a country name from ``coutries``.

    All heading patterns require a newline before the heading and one after
    it. The trailing newline is only looked at, not consumed, so that a
    heading placed directly below another heading can still be matched
    (regex matches never overlap, and the next heading needs that newline as
    its own leading one).
    """

    def __init__(self):
        # Any heading with 2 to 5 "=" on each side.
        self.sections_re = re.compile(r"\n\s*[=]{2,5}\s*[^=]+\s*[=]{2,5}\s*(?=\n)")
        countries_hant = [convert_for_mw(xx, "zh-hant") for xx in coutries]
        # A heading whose whole title is a country name (listed or zh-hant form).
        self.coutries_re = re.compile(
            r"\n\s*[=]{2,5}\s*(%s)\s*[=]{2,5}\s*(?=\n)"
            % "|".join(coutries + countries_hant)
        )
        # A heading whose whole title is Taiwan, in any 台/臺 + 湾/灣 spelling,
        # bare or as a wiki link.
        # fix hk and mc by the way
        self.tw_re = re.compile(
            r"(\n\s*[=]{2,5}\s*)"
            r"([台臺][湾灣]|\[\[[台臺][湾灣]\]\]|香港|澳門|澳门)"
            r"(\s*[=]{2,5}\s*)(?=\n)"
        )

    def filter(self, text: str) -> str:
        """Filter text.

        Parameters
        ----------
        text : str
            Text to filter.

        Returns
        -------
        str
            Filtered text. It is returned unchanged unless a Taiwan/HK/Macau
            heading and a country heading share the same level.
        """
        level_tw = set()
        level_country = set()
        for section in self.sections_re.findall(text):
            # The match stops right before the closing newline; put it back
            # so the per-heading patterns can see it in their lookahead.
            heading = section + "\n"
            # "=" cannot occur inside the title, so half of the total count
            # is the heading level.
            level = section.count("=") // 2
            if self.tw_re.search(heading):
                level_tw.add(level)
            if self.coutries_re.search(heading):
                level_country.add(level)
        if level_tw & level_country:
            # might be over-replacing, but is fast: every matching heading is
            # rewritten, whatever its own level is
            text = self.tw_re.sub(r"\1中国\2\3", text)
        return text

    @property
    def log(self) -> str:
        """Return the log message associated with this filter."""
        return "修正中国台湾与国家并列时的称呼3"