test
This commit is contained in:
		| @ -0,0 +1,351 @@ | ||||
| """ | ||||
| Metadata about languages used by our model training code for our | ||||
| SingleByteCharSetProbers.  Could be used for other things in the future. | ||||
|  | ||||
| This code is based on the language metadata from the uchardet project. | ||||
| """ | ||||
|  | ||||
| from string import ascii_letters | ||||
|  | ||||
| # TODO: Add Ukrainian (KOI8-U) | ||||
|  | ||||
|  | ||||
class Language:
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet, stored as a
                    de-duplicated string sorted by code point. If `use_ascii`
                    is `True`, you only need to add those not in the ASCII set.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name=None,
        iso_code=None,
        use_ascii=True,
        charsets=None,
        alphabet=None,
        wiki_start_pages=None,
    ):
        """Store the metadata and normalize ``alphabet``.

        ``alphabet`` may be a string or any iterable of single characters; it
        is never modified in place.

        :raises ValueError: if ``use_ascii`` is false and no alphabet is given.
        """
        super().__init__()
        # NOTE: attributes are assigned in this order on purpose; __repr__
        # lists them in ``__dict__`` (i.e. assignment) order.
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if use_ascii:
            # Collect into a fresh set rather than using ``alphabet += ...``,
            # which would extend a caller-supplied list in place.
            letters = set(ascii_letters)
            if alphabet:
                letters.update(alphabet)
        elif alphabet:
            letters = set(alphabet)
        else:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # ``letters`` is guaranteed non-empty here, so no None fallback is
        # needed.
        self.alphabet = "".join(sorted(letters))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        param_str = ", ".join(
            f"{attr}={value!r}"
            for attr, value in self.__dict__.items()
            if not attr.startswith("_")
        )
        return f"{self.__class__.__name__}({param_str})"
|  | ||||
|  | ||||
# Registry of supported languages, keyed by English language name. Each value
# carries the charsets to build models for, the alphabet, and the Wikipedia
# start pages for crawling training data.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" below appears to be the same code page as
        # IBM864, which contradicts the sentence above -- confirm whether it
        # should be listed.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # ISO 639-1 language code for Czech ("cz" is the country code).
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        alphabet="äöüßÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        # Cyrillic only (see above), so do not mix in ASCII letters; this
        # matches the other Cyrillic-script entries in this table.
        use_ascii=False,
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}
		Reference in New Issue
	
	Block a user