ext/js/language/language-descriptors.js at lambda-fork/main

onedeuxtriseigo.nullpo.dev / yomitan
fork atom
Pop-up dictionary browser extension for language learning. Successor to Yomichan. (PERSONAL FORK)
fork atom
yomitan / ext / js / language / language-descriptors.js
at lambda-fork/main 569 lines 16 kB view raw
wrap content
daxida [it] Support removing apostrophed words in Italian (#2318) 18d ago
0cbc1fe6
/*
 * Copyright (C) 2024-2025  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 17
 18import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js';
 19import {
 20    addHamzaBottom,
 21    addHamzaTop,
 22    convertAlifMaqsuraToYaa,
 23    convertHaToTaMarbuta,
 24    normalizeUnicode,
 25    removeArabicScriptDiacritics,
 26    removeTatweel,
 27} from './ar/arabic-text-preprocessors.js';
 28import {arabicTransforms} from './ar/arabic-transforms.js';
 29import {normalizeRadicalCharacters} from './CJK-util.js';
 30import {eszettPreprocessor} from './de/german-text-preprocessors.js';
 31import {germanTransforms} from './de/german-transforms.js';
 32import {removeDoubleAcuteAccents} from './el/modern-greek-processors.js';
 33import {englishTransforms} from './en/english-transforms.js';
 34import {esperantoTransforms} from './eo/esperanto-transforms.js';
 35import {spanishTransforms} from './es/spanish-transforms.js';
 36import {apostropheVariants} from './fr/french-text-preprocessors.js';
 37import {frenchTransforms} from './fr/french-transforms.js';
 38import {irishTransforms} from './ga/irish-transforms.js';
 39import {convertLatinToGreek} from './grc/ancient-greek-processors.js';
 40import {ancientGreekTransforms} from './grc/ancient-greek-transforms.js';
 41import {removeApostrophedWords} from './it/italian-processors.js';
 42import {
 43    alphabeticToHiragana,
 44    alphanumericWidthVariants,
 45    collapseEmphaticSequences,
 46    convertHalfWidthCharacters,
 47    convertHiraganaToKatakana,
 48    normalizeCJKCompatibilityCharacters,
 49    normalizeCombiningCharacters,
 50    standardizeKanji,
 51} from './ja/japanese-text-preprocessors.js';
 52import {japaneseTransforms} from './ja/japanese-transforms.js';
 53import {isStringPartiallyJapanese} from './ja/japanese.js';
 54import {georgianTransforms} from './ka/georgian-transforms.js';
 55import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
 56import {koreanTransforms} from './ko/korean-transforms.js';
 57import {processDiphtongs} from './la/latin-text-preprocessors.js';
 58import {latinTransforms} from './la/latin-transforms.js';
 59import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
 60import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 61import {addSerboCroatianDiacritics, removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 62import {albanianTransforms} from './sq/albanian-transforms.js';
 63import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
 64import {tagalogTransforms} from './tl/tagalog-transforms.js';
 65import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 66import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
 67import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
 68import {yiddishTransforms} from './yi/yiddish-transforms.js';
 69import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
 70
 71const capitalizationPreprocessors = {
 72    decapitalize,
 73    capitalizeFirstLetter,
 74};
 75
 76/** @type {import('language-descriptors').LanguageDescriptorAny[]} */
 77const languageDescriptors = [
 78    {
 79        iso: 'aii',
 80        iso639_3: 'aii',
 81        name: 'Assyrian Neo-Aramaic',
 82        exampleText: 'ܟܵܬܹܒ݂',
 83        textPreprocessors: {
 84            removeSyriacScriptDiacritics,
 85        },
 86    },
 87    {
 88        iso: 'ar',
 89        iso639_3: 'ara',
 90        name: 'Arabic (MSA)',
 91        exampleText: 'قَرَأَ',
 92        textPreprocessors: {
 93            removeArabicScriptDiacritics,
 94            removeTatweel,
 95            normalizeUnicode,
 96            addHamzaTop,
 97            addHamzaBottom,
 98            convertAlifMaqsuraToYaa,
 99        },
100        languageTransforms: arabicTransforms,
101    },
102    {
103        iso: 'arz',
104        iso639_3: 'arz',
105        name: 'Arabic (Egyptian)',
106        exampleText: 'قَرَأَ',
107        textPreprocessors: {
108            removeArabicScriptDiacritics,
109            removeTatweel,
110            normalizeUnicode,
111            addHamzaTop,
112            addHamzaBottom,
113            convertAlifMaqsuraToYaa,
114            convertHaToTaMarbuta,
115        },
116        languageTransforms: arabicTransforms,
117    },
118    {
119        iso: 'be',
120        iso639_3: 'bel',
121        name: 'Belarusian',
122        exampleText: 'чытаць',
123        textPreprocessors: capitalizationPreprocessors,
124    },
125    {
126        iso: 'bg',
127        iso639_3: 'bul',
128        name: 'Bulgarian',
129        exampleText: 'чета',
130        textPreprocessors: capitalizationPreprocessors,
131    },
132    {
133        iso: 'cs',
134        iso639_3: 'ces',
135        name: 'Czech',
136        exampleText: 'číst',
137        textPreprocessors: capitalizationPreprocessors,
138    },
139    {
140        iso: 'da',
141        iso639_3: 'dan',
142        name: 'Danish',
143        exampleText: 'læse',
144        textPreprocessors: {
145            ...capitalizationPreprocessors,
146        },
147    },
148    {
149        iso: 'de',
150        iso639_3: 'deu',
151        name: 'German',
152        exampleText: 'lesen',
153        textPreprocessors: {
154            ...capitalizationPreprocessors,
155            eszettPreprocessor,
156        },
157        languageTransforms: germanTransforms,
158    },
159    {
160        iso: 'el',
161        iso639_3: 'ell',
162        name: 'Greek',
163        exampleText: 'διαβάζω',
164        textPreprocessors: {
165            ...capitalizationPreprocessors,
166            removeDoubleAcuteAccents,
167        },
168    },
169    {
170        iso: 'en',
171        iso639_3: 'eng',
172        name: 'English',
173        exampleText: 'read',
174        textPreprocessors: capitalizationPreprocessors,
175        languageTransforms: englishTransforms,
176    },
177    {
178        iso: 'eo',
179        iso639_3: 'epo',
180        name: 'Esperanto',
181        exampleText: 'legi',
182        textPreprocessors: capitalizationPreprocessors,
183        languageTransforms: esperantoTransforms,
184    },
185    {
186        iso: 'es',
187        iso639_3: 'spa',
188        name: 'Spanish',
189        exampleText: 'leer',
190        textPreprocessors: capitalizationPreprocessors,
191        languageTransforms: spanishTransforms,
192    },
193    {
194        iso: 'et',
195        iso639_3: 'est',
196        name: 'Estonian',
197        exampleText: 'lugema',
198        textPreprocessors: capitalizationPreprocessors,
199    },
200    {
201        iso: 'fa',
202        iso639_3: 'fas',
203        name: 'Persian',
204        exampleText: 'خواندن',
205        textPreprocessors: {
206            removeArabicScriptDiacritics,
207        },
208    },
209    {
210        iso: 'fi',
211        iso639_3: 'fin',
212        name: 'Finnish',
213        exampleText: 'lukea',
214        textPreprocessors: capitalizationPreprocessors,
215    },
216    {
217        iso: 'fr',
218        iso639_3: 'fra',
219        name: 'French',
220        exampleText: 'lire',
221        textPreprocessors: {
222            ...capitalizationPreprocessors,
223            apostropheVariants,
224        },
225        languageTransforms: frenchTransforms,
226    },
227    {
228        iso: 'ga',
229        iso639_3: 'gle',
230        name: 'Irish',
231        exampleText: 'léigh',
232        textPreprocessors: capitalizationPreprocessors,
233        languageTransforms: irishTransforms,
234    },
235    {
236        iso: 'grc',
237        iso639_3: 'grc',
238        name: 'Ancient Greek',
239        exampleText: 'γράφω', /* 'to write' */
240        textPreprocessors: {
241            ...capitalizationPreprocessors,
242            removeAlphabeticDiacritics,
243            convertLatinToGreek,
244        },
245        languageTransforms: ancientGreekTransforms,
246    },
247    {
248        // no 2 letter iso for hawaiian
249        iso: 'haw',
250        iso639_3: 'haw',
251        name: 'Hawaiian',
252        exampleText: 'heluhelu',
253        textPreprocessors: capitalizationPreprocessors,
254    },
255    {
256        iso: 'he',
257        iso639_3: 'heb',
258        name: 'Hebrew',
259        exampleText: 'קריאה',
260    },
261    {
262        iso: 'hi',
263        iso639_3: 'hin',
264        name: 'Hindi',
265        exampleText: 'पढ़ने के लिए',
266    },
267    {
268        iso: 'hu',
269        iso639_3: 'hun',
270        name: 'Hungarian',
271        exampleText: 'olvasni',
272        textPreprocessors: capitalizationPreprocessors,
273    },
274    {
275        iso: 'id',
276        iso639_3: 'ind',
277        name: 'Indonesian',
278        exampleText: 'baca',
279        textPreprocessors: {
280            ...capitalizationPreprocessors,
281            removeAlphabeticDiacritics,
282        },
283    },
284    {
285        iso: 'it',
286        iso639_3: 'ita',
287        name: 'Italian',
288        exampleText: 'leggere',
289        textPreprocessors: {
290            ...capitalizationPreprocessors,
291            removeAlphabeticDiacritics,
292            removeApostrophedWords,
293        },
294    },
295    {
296        iso: 'la',
297        iso639_3: 'lat',
298        name: 'Latin',
299        exampleText: 'legō',
300        textPreprocessors: {
301            ...capitalizationPreprocessors,
302            removeAlphabeticDiacritics,
303            processDiphtongs,
304        },
305        languageTransforms: latinTransforms,
306    },
307    {
308        iso: 'lo',
309        iso639_3: 'lao',
310        name: 'Lao',
311        exampleText: 'ອ່ານ',
312    },
313    {
314        iso: 'lv',
315        iso639_3: 'lav',
316        name: 'Latvian',
317        exampleText: 'lasīt',
318        textPreprocessors: capitalizationPreprocessors,
319    },
320    {
321        iso: 'ja',
322        iso639_3: 'jpn',
323        name: 'Japanese',
324        exampleText: '読め',
325        isTextLookupWorthy: isStringPartiallyJapanese,
326        textPreprocessors: {
327            convertHalfWidthCharacters,
328            alphabeticToHiragana,
329            normalizeCombiningCharacters,
330            normalizeCJKCompatibilityCharacters,
331            normalizeRadicalCharacters,
332            alphanumericWidthVariants,
333            convertHiraganaToKatakana,
334            collapseEmphaticSequences,
335            standardizeKanji,
336        },
337        languageTransforms: japaneseTransforms,
338    },
339    {
340        iso: 'ka',
341        iso639_3: 'kat',
342        name: 'Georgian',
343        exampleText: 'კითხვა', // Georgian for “read”
344        languageTransforms: georgianTransforms,
345    },
346    {
347        iso: 'kn',
348        iso639_3: 'kan',
349        name: 'Kannada',
350        exampleText: 'ಓದು',
351    },
352    {
353        iso: 'km',
354        iso639_3: 'khm',
355        name: 'Khmer',
356        exampleText: 'អាន',
357    },
358    {
359        iso: 'ko',
360        iso639_3: 'kor',
361        name: 'Korean',
362        exampleText: '읽어',
363        textPreprocessors: {
364            disassembleHangul,
365        },
366        textPostprocessors: {
367            reassembleHangul,
368        },
369        languageTransforms: koreanTransforms,
370    },
371    {
372        iso: 'mn',
373        iso639_3: 'mon',
374        name: 'Mongolian',
375        exampleText: 'унших',
376        textPreprocessors: capitalizationPreprocessors,
377    },
378    {
379        iso: 'mt',
380        iso639_3: 'mlt',
381        name: 'Maltese',
382        exampleText: 'kiteb',
383        textPreprocessors: capitalizationPreprocessors,
384    },
385    {
386        iso: 'nl',
387        iso639_3: 'nld',
388        name: 'Dutch',
389        exampleText: 'lezen',
390        textPreprocessors: capitalizationPreprocessors,
391    },
392    {
393        iso: 'no',
394        iso639_3: 'nor',
395        name: 'Norwegian',
396        exampleText: 'lese',
397        textPreprocessors: {
398            ...capitalizationPreprocessors,
399        },
400    },
401    {
402        iso: 'pl',
403        iso639_3: 'pol',
404        name: 'Polish',
405        exampleText: 'czytać',
406        textPreprocessors: capitalizationPreprocessors,
407    },
408    {
409        iso: 'pt',
410        iso639_3: 'por',
411        name: 'Portuguese',
412        exampleText: 'ler',
413        textPreprocessors: capitalizationPreprocessors,
414    },
415    {
416        iso: 'ro',
417        iso639_3: 'ron',
418        name: 'Romanian',
419        exampleText: 'citi',
420        textPreprocessors: {
421            ...capitalizationPreprocessors,
422            removeAlphabeticDiacritics,
423        },
424    },
425    {
426        iso: 'ru',
427        iso639_3: 'rus',
428        name: 'Russian',
429        exampleText: 'читать',
430        textPreprocessors: {
431            ...capitalizationPreprocessors,
432            yoToE,
433            removeRussianDiacritics,
434        },
435    },
436    {
437        iso: 'sga',
438        iso639_3: 'sga',
439        name: 'Old Irish',
440        exampleText: 'légaid',
441        textPreprocessors: {
442            ...capitalizationPreprocessors,
443            removeAlphabeticDiacritics,
444        },
445        languageTransforms: oldIrishTransforms,
446    },
447    {
448        iso: 'sh',
449        iso639_3: 'hbs',
450        name: 'Serbo-Croatian',
451        exampleText: 'čìtati',
452        textPreprocessors: {
453            ...capitalizationPreprocessors,
454            removeSerboCroatianAccentMarks,
455            addSerboCroatianDiacritics,
456        },
457    },
458    {
459        iso: 'sq',
460        iso639_3: 'sqi',
461        name: 'Albanian',
462        exampleText: 'ndihmoj', /* 'to help' */
463        textPreprocessors: capitalizationPreprocessors,
464        languageTransforms: albanianTransforms,
465    },
466    {
467        iso: 'sv',
468        iso639_3: 'swe',
469        name: 'Swedish',
470        exampleText: 'läsa',
471        textPreprocessors: capitalizationPreprocessors,
472    },
473    {
474        iso: 'th',
475        iso639_3: 'tha',
476        name: 'Thai',
477        exampleText: 'อ่าน',
478    },
479    {
480        iso: 'tl',
481        iso639_3: 'tgl',
482        name: 'Tagalog',
483        exampleText: 'basahin',
484        textPreprocessors: {
485            ...capitalizationPreprocessors,
486            removeAlphabeticDiacritics,
487        },
488        languageTransforms: tagalogTransforms,
489    },
490    {
491        iso: 'tr',
492        iso639_3: 'tur',
493        name: 'Turkish',
494        exampleText: 'okumak',
495        textPreprocessors: capitalizationPreprocessors,
496    },
497    {
498        iso: 'tok',
499        iso639_3: 'tok',
500        name: 'Toki Pona',
501        exampleText: 'wile',
502        textPreprocessors: capitalizationPreprocessors,
503    },
504    {
505        iso: 'uk',
506        iso639_3: 'ukr',
507        name: 'Ukrainian',
508        exampleText: 'читати',
509        textPreprocessors: capitalizationPreprocessors,
510    },
511    {
512        iso: 'vi',
513        iso639_3: 'vie',
514        name: 'Vietnamese',
515        exampleText: 'đọc',
516        textPreprocessors: {
517            ...capitalizationPreprocessors,
518            normalizeDiacritics,
519        },
520    },
521    {
522        iso: 'cy',
523        iso639_3: 'cym',
524        name: 'Welsh',
525        exampleText: 'ddarllen',
526        textPreprocessors: capitalizationPreprocessors,
527    },
528    {
529        iso: 'yi',
530        iso639_3: 'yid',
531        name: 'Yiddish',
532        exampleText: 'באַשאַפֿן',
533        textPreprocessors: {
534            removeYiddishDiacritics,
535            combineYiddishLigatures,
536        },
537        textPostprocessors: {
538            convertFinalLetters,
539            convertYiddishLigatures,
540        },
541        languageTransforms: yiddishTransforms,
542    },
543    {
544        iso: 'yue',
545        iso639_3: 'yue',
546        name: 'Cantonese',
547        exampleText: '讀',
548        textPreprocessors: {
549            normalizeRadicalCharacters,
550        },
551    },
552    {
553        iso: 'zh',
554        iso639_3: 'zho',
555        name: 'Chinese',
556        exampleText: '读',
557        isTextLookupWorthy: isStringPartiallyChinese,
558        readingNormalizer: normalizePinyin,
559        textPreprocessors: {
560            normalizeRadicalCharacters,
561        },
562    },
563];
564
/**
 * Lookup table from a descriptor's `iso` value to the descriptor itself.
 *
 * It is filled once, at module load, by walking `languageDescriptors` in
 * array order. Because `Map.prototype.set` overwrites, a later entry with
 * a duplicate `iso` would replace an earlier one.
 *
 * @type {Map<string, import('language-descriptors').LanguageDescriptorAny>}
 */
export const languageDescriptorMap = new Map();
for (const languageDescriptor of languageDescriptors) {
    languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor);
}