// Pop-up dictionary browser extension for language learning. Successor to Yomichan. (PERSONAL FORK)
/*
 * Copyright (C) 2024-2025  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
17
18import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js';
19import {
20 addHamzaBottom,
21 addHamzaTop,
22 convertAlifMaqsuraToYaa,
23 convertHaToTaMarbuta,
24 normalizeUnicode,
25 removeArabicScriptDiacritics,
26 removeTatweel,
27} from './ar/arabic-text-preprocessors.js';
28import {arabicTransforms} from './ar/arabic-transforms.js';
29import {normalizeRadicalCharacters} from './CJK-util.js';
30import {eszettPreprocessor} from './de/german-text-preprocessors.js';
31import {germanTransforms} from './de/german-transforms.js';
32import {removeDoubleAcuteAccents} from './el/modern-greek-processors.js';
33import {englishTransforms} from './en/english-transforms.js';
34import {esperantoTransforms} from './eo/esperanto-transforms.js';
35import {spanishTransforms} from './es/spanish-transforms.js';
36import {apostropheVariants} from './fr/french-text-preprocessors.js';
37import {frenchTransforms} from './fr/french-transforms.js';
38import {irishTransforms} from './ga/irish-transforms.js';
39import {convertLatinToGreek} from './grc/ancient-greek-processors.js';
40import {ancientGreekTransforms} from './grc/ancient-greek-transforms.js';
41import {removeApostrophedWords} from './it/italian-processors.js';
42import {
43 alphabeticToHiragana,
44 alphanumericWidthVariants,
45 collapseEmphaticSequences,
46 convertHalfWidthCharacters,
47 convertHiraganaToKatakana,
48 normalizeCJKCompatibilityCharacters,
49 normalizeCombiningCharacters,
50 standardizeKanji,
51} from './ja/japanese-text-preprocessors.js';
52import {japaneseTransforms} from './ja/japanese-transforms.js';
53import {isStringPartiallyJapanese} from './ja/japanese.js';
54import {georgianTransforms} from './ka/georgian-transforms.js';
55import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
56import {koreanTransforms} from './ko/korean-transforms.js';
57import {processDiphtongs} from './la/latin-text-preprocessors.js';
58import {latinTransforms} from './la/latin-transforms.js';
59import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
60import {oldIrishTransforms} from './sga/old-irish-transforms.js';
61import {addSerboCroatianDiacritics, removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
62import {albanianTransforms} from './sq/albanian-transforms.js';
63import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
64import {tagalogTransforms} from './tl/tagalog-transforms.js';
65import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
66import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
67import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
68import {yiddishTransforms} from './yi/yiddish-transforms.js';
69import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
70
/**
 * Bundle of the `decapitalize` and `capitalizeFirstLetter` text preprocessors.
 *
 * Language descriptors in this file either reference this object directly as
 * their `textPreprocessors` or spread it into a larger preprocessor set.
 */
const capitalizationPreprocessors = {
    decapitalize,
    capitalizeFirstLetter,
};
75
/**
 * Descriptors for every language known to this module, in the order in which
 * they are inserted into `languageDescriptorMap`.
 *
 * Every entry provides:
 * - `iso`: the code used as the key in `languageDescriptorMap`.
 * - `iso639_3`: a three-letter language code.
 * - `name`: a display name.
 * - `exampleText`: a short sample string in the language.
 *
 * Entries may additionally provide `textPreprocessors`, `textPostprocessors`,
 * `languageTransforms`, `isTextLookupWorthy` and `readingNormalizer`.
 *
 * NOTE(review): property order inside `textPreprocessors`/`textPostprocessors`
 * is preserved as-is; presumably consumers depend on it - confirm before reordering.
 * @type {import('language-descriptors').LanguageDescriptorAny[]}
 */
const languageDescriptors = [
    {
        iso: 'aii',
        iso639_3: 'aii',
        name: 'Assyrian Neo-Aramaic',
        exampleText: 'ܟܵܬܹܒ݂',
        textPreprocessors: {
            removeSyriacScriptDiacritics,
        },
    },
    {
        iso: 'ar',
        iso639_3: 'ara',
        name: 'Arabic (MSA)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'arz',
        iso639_3: 'arz',
        name: 'Arabic (Egyptian)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
            convertHaToTaMarbuta,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'be',
        iso639_3: 'bel',
        name: 'Belarusian',
        exampleText: 'чытаць',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'bg',
        iso639_3: 'bul',
        name: 'Bulgarian',
        exampleText: 'чета',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'cs',
        iso639_3: 'ces',
        name: 'Czech',
        exampleText: 'číst',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'da',
        iso639_3: 'dan',
        name: 'Danish',
        exampleText: 'læse',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'de',
        iso639_3: 'deu',
        name: 'German',
        exampleText: 'lesen',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            eszettPreprocessor,
        },
        languageTransforms: germanTransforms,
    },
    {
        iso: 'el',
        iso639_3: 'ell',
        name: 'Greek',
        exampleText: 'διαβάζω',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeDoubleAcuteAccents,
        },
    },
    {
        iso: 'en',
        iso639_3: 'eng',
        name: 'English',
        exampleText: 'read',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: englishTransforms,
    },
    {
        iso: 'eo',
        iso639_3: 'epo',
        name: 'Esperanto',
        exampleText: 'legi',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: esperantoTransforms,
    },
    {
        iso: 'es',
        iso639_3: 'spa',
        name: 'Spanish',
        exampleText: 'leer',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: spanishTransforms,
    },
    {
        iso: 'et',
        iso639_3: 'est',
        name: 'Estonian',
        exampleText: 'lugema',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fa',
        iso639_3: 'fas',
        name: 'Persian',
        exampleText: 'خواندن',
        textPreprocessors: {
            removeArabicScriptDiacritics,
        },
    },
    {
        iso: 'fi',
        iso639_3: 'fin',
        name: 'Finnish',
        exampleText: 'lukea',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fr',
        iso639_3: 'fra',
        name: 'French',
        exampleText: 'lire',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            apostropheVariants,
        },
        languageTransforms: frenchTransforms,
    },
    {
        iso: 'ga',
        iso639_3: 'gle',
        name: 'Irish',
        exampleText: 'léigh',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: irishTransforms,
    },
    {
        iso: 'grc',
        iso639_3: 'grc',
        name: 'Ancient Greek',
        exampleText: 'γράφω', /* 'to write' */
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            convertLatinToGreek,
        },
        languageTransforms: ancientGreekTransforms,
    },
    {
        // There is no two-letter ISO code for Hawaiian, so the three-letter code is used for `iso` as well.
        iso: 'haw',
        iso639_3: 'haw',
        name: 'Hawaiian',
        exampleText: 'heluhelu',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'he',
        iso639_3: 'heb',
        name: 'Hebrew',
        exampleText: 'קריאה',
    },
    {
        iso: 'hi',
        iso639_3: 'hin',
        name: 'Hindi',
        exampleText: 'पढ़ने के लिए',
    },
    {
        iso: 'hu',
        iso639_3: 'hun',
        name: 'Hungarian',
        exampleText: 'olvasni',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'id',
        iso639_3: 'ind',
        name: 'Indonesian',
        exampleText: 'baca',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'it',
        iso639_3: 'ita',
        name: 'Italian',
        exampleText: 'leggere',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            removeApostrophedWords,
        },
    },
    {
        iso: 'la',
        iso639_3: 'lat',
        name: 'Latin',
        exampleText: 'legō',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            processDiphtongs,
        },
        languageTransforms: latinTransforms,
    },
    {
        iso: 'lo',
        iso639_3: 'lao',
        name: 'Lao',
        exampleText: 'ອ່ານ',
    },
    {
        iso: 'lv',
        iso639_3: 'lav',
        name: 'Latvian',
        exampleText: 'lasīt',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ja',
        iso639_3: 'jpn',
        name: 'Japanese',
        exampleText: '読め',
        isTextLookupWorthy: isStringPartiallyJapanese,
        textPreprocessors: {
            convertHalfWidthCharacters,
            alphabeticToHiragana,
            normalizeCombiningCharacters,
            normalizeCJKCompatibilityCharacters,
            normalizeRadicalCharacters,
            alphanumericWidthVariants,
            convertHiraganaToKatakana,
            collapseEmphaticSequences,
            standardizeKanji,
        },
        languageTransforms: japaneseTransforms,
    },
    {
        iso: 'ka',
        iso639_3: 'kat',
        name: 'Georgian',
        exampleText: 'კითხვა', // Georgian for “read”
        languageTransforms: georgianTransforms,
    },
    {
        iso: 'kn',
        iso639_3: 'kan',
        name: 'Kannada',
        exampleText: 'ಓದು',
    },
    {
        iso: 'km',
        iso639_3: 'khm',
        name: 'Khmer',
        exampleText: 'អាន',
    },
    {
        iso: 'ko',
        iso639_3: 'kor',
        name: 'Korean',
        exampleText: '읽어',
        textPreprocessors: {
            disassembleHangul,
        },
        textPostprocessors: {
            reassembleHangul,
        },
        languageTransforms: koreanTransforms,
    },
    {
        iso: 'mn',
        iso639_3: 'mon',
        name: 'Mongolian',
        exampleText: 'унших',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'mt',
        iso639_3: 'mlt',
        name: 'Maltese',
        exampleText: 'kiteb',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'nl',
        iso639_3: 'nld',
        name: 'Dutch',
        exampleText: 'lezen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'no',
        iso639_3: 'nor',
        name: 'Norwegian',
        exampleText: 'lese',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'pl',
        iso639_3: 'pol',
        name: 'Polish',
        exampleText: 'czytać',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'pt',
        iso639_3: 'por',
        name: 'Portuguese',
        exampleText: 'ler',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ro',
        iso639_3: 'ron',
        name: 'Romanian',
        exampleText: 'citi',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'ru',
        iso639_3: 'rus',
        name: 'Russian',
        exampleText: 'читать',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            yoToE,
            removeRussianDiacritics,
        },
    },
    {
        iso: 'sga',
        iso639_3: 'sga',
        name: 'Old Irish',
        exampleText: 'légaid',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: oldIrishTransforms,
    },
    {
        iso: 'sh',
        iso639_3: 'hbs',
        name: 'Serbo-Croatian',
        exampleText: 'čìtati',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeSerboCroatianAccentMarks,
            addSerboCroatianDiacritics,
        },
    },
    {
        iso: 'sq',
        iso639_3: 'sqi',
        name: 'Albanian',
        exampleText: 'ndihmoj', /* 'to help' */
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: albanianTransforms,
    },
    {
        iso: 'sv',
        iso639_3: 'swe',
        name: 'Swedish',
        exampleText: 'läsa',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'th',
        iso639_3: 'tha',
        name: 'Thai',
        exampleText: 'อ่าน',
    },
    {
        iso: 'tl',
        iso639_3: 'tgl',
        name: 'Tagalog',
        exampleText: 'basahin',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: tagalogTransforms,
    },
    {
        iso: 'tr',
        iso639_3: 'tur',
        name: 'Turkish',
        exampleText: 'okumak',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'tok',
        iso639_3: 'tok',
        name: 'Toki Pona',
        exampleText: 'wile',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'uk',
        iso639_3: 'ukr',
        name: 'Ukrainian',
        exampleText: 'читати',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'vi',
        iso639_3: 'vie',
        name: 'Vietnamese',
        exampleText: 'đọc',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            normalizeDiacritics,
        },
    },
    {
        iso: 'cy',
        iso639_3: 'cym',
        name: 'Welsh',
        exampleText: 'ddarllen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'yi',
        iso639_3: 'yid',
        name: 'Yiddish',
        exampleText: 'באַשאַפֿן',
        textPreprocessors: {
            removeYiddishDiacritics,
            combineYiddishLigatures,
        },
        textPostprocessors: {
            convertFinalLetters,
            convertYiddishLigatures,
        },
        languageTransforms: yiddishTransforms,
    },
    {
        iso: 'yue',
        iso639_3: 'yue',
        name: 'Cantonese',
        exampleText: '讀',
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
    {
        iso: 'zh',
        iso639_3: 'zho',
        name: 'Chinese',
        exampleText: '读',
        isTextLookupWorthy: isStringPartiallyChinese,
        readingNormalizer: normalizePinyin,
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
];
564
/**
 * Lookup table from a descriptor's `iso` code to the descriptor itself.
 *
 * Populated once at module load from `languageDescriptors`, so iteration
 * order follows the order of that array. If two descriptors ever shared an
 * `iso` value, the later one would replace the earlier one.
 * @type {Map<string, import('language-descriptors').LanguageDescriptorAny>}
 */
export const languageDescriptorMap = new Map();
for (const languageDescriptor of languageDescriptors) {
    languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor);
}