Pop-up dictionary browser extension for language learning. Successor to Yomichan. (PERSONAL FORK)
at lambda-fork/main 569 lines 16 kB view raw
/*
 * Copyright (C) 2024-2025  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js';
import {
    addHamzaBottom,
    addHamzaTop,
    convertAlifMaqsuraToYaa,
    convertHaToTaMarbuta,
    normalizeUnicode,
    removeArabicScriptDiacritics,
    removeTatweel,
} from './ar/arabic-text-preprocessors.js';
import {arabicTransforms} from './ar/arabic-transforms.js';
import {normalizeRadicalCharacters} from './CJK-util.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {removeDoubleAcuteAccents} from './el/modern-greek-processors.js';
import {englishTransforms} from './en/english-transforms.js';
import {esperantoTransforms} from './eo/esperanto-transforms.js';
import {spanishTransforms} from './es/spanish-transforms.js';
import {apostropheVariants} from './fr/french-text-preprocessors.js';
import {frenchTransforms} from './fr/french-transforms.js';
import {irishTransforms} from './ga/irish-transforms.js';
import {convertLatinToGreek} from './grc/ancient-greek-processors.js';
import {ancientGreekTransforms} from './grc/ancient-greek-transforms.js';
import {removeApostrophedWords} from './it/italian-processors.js';
import {
    alphabeticToHiragana,
    alphanumericWidthVariants,
    collapseEmphaticSequences,
    convertHalfWidthCharacters,
    convertHiraganaToKatakana,
    normalizeCJKCompatibilityCharacters,
    normalizeCombiningCharacters,
    standardizeKanji,
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
import {georgianTransforms} from './ka/georgian-transforms.js';
import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
import {koreanTransforms} from './ko/korean-transforms.js';
import {processDiphtongs} from './la/latin-text-preprocessors.js';
import {latinTransforms} from './la/latin-transforms.js';
import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {oldIrishTransforms} from './sga/old-irish-transforms.js';
import {addSerboCroatianDiacritics, removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
import {tagalogTransforms} from './tl/tagalog-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
import {yiddishTransforms} from './yi/yiddish-transforms.js';
import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';

/**
 * The two capitalization preprocessors (`decapitalize`, `capitalizeFirstLetter`)
 * bundled together. Many descriptors below either reference this object directly
 * or spread it into a larger `textPreprocessors` object.
 */
const capitalizationPreprocessors = {
    decapitalize,
    capitalizeFirstLetter,
};

/**
 * Table of language descriptors, one object per language.
 *
 * Every entry carries `iso`, `iso639_3`, `name` and `exampleText`. Entries may
 * additionally supply `textPreprocessors`, `textPostprocessors`,
 * `languageTransforms`, `isTextLookupWorthy` and `readingNormalizer`.
 *
 * The order of entries is preserved when `languageDescriptorMap` is built.
 * @type {import('language-descriptors').LanguageDescriptorAny[]}
 */
const languageDescriptors = [
    {
        iso: 'aii',
        iso639_3: 'aii',
        name: 'Assyrian Neo-Aramaic',
        exampleText: 'ܟܵܬܹܒ݂',
        textPreprocessors: {
            removeSyriacScriptDiacritics,
        },
    },
    {
        iso: 'ar',
        iso639_3: 'ara',
        name: 'Arabic (MSA)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'arz',
        iso639_3: 'arz',
        name: 'Arabic (Egyptian)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
            convertHaToTaMarbuta,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'be',
        iso639_3: 'bel',
        name: 'Belarusian',
        exampleText: 'чытаць',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'bg',
        iso639_3: 'bul',
        name: 'Bulgarian',
        exampleText: 'чета',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'cs',
        iso639_3: 'ces',
        name: 'Czech',
        exampleText: 'číst',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'da',
        iso639_3: 'dan',
        name: 'Danish',
        exampleText: 'læse',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'de',
        iso639_3: 'deu',
        name: 'German',
        exampleText: 'lesen',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            eszettPreprocessor,
        },
        languageTransforms: germanTransforms,
    },
    {
        iso: 'el',
        iso639_3: 'ell',
        name: 'Greek',
        exampleText: 'διαβάζω',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeDoubleAcuteAccents,
        },
    },
    {
        iso: 'en',
        iso639_3: 'eng',
        name: 'English',
        exampleText: 'read',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: englishTransforms,
    },
    {
        iso: 'eo',
        iso639_3: 'epo',
        name: 'Esperanto',
        exampleText: 'legi',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: esperantoTransforms,
    },
    {
        iso: 'es',
        iso639_3: 'spa',
        name: 'Spanish',
        exampleText: 'leer',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: spanishTransforms,
    },
    {
        iso: 'et',
        iso639_3: 'est',
        name: 'Estonian',
        exampleText: 'lugema',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fa',
        iso639_3: 'fas',
        name: 'Persian',
        exampleText: 'خواندن',
        textPreprocessors: {
            removeArabicScriptDiacritics,
        },
    },
    {
        iso: 'fi',
        iso639_3: 'fin',
        name: 'Finnish',
        exampleText: 'lukea',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fr',
        iso639_3: 'fra',
        name: 'French',
        exampleText: 'lire',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            apostropheVariants,
        },
        languageTransforms: frenchTransforms,
    },
    {
        iso: 'ga',
        iso639_3: 'gle',
        name: 'Irish',
        exampleText: 'léigh',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: irishTransforms,
    },
    {
        iso: 'grc',
        iso639_3: 'grc',
        name: 'Ancient Greek',
        exampleText: 'γράφω', /* 'to write' */
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            convertLatinToGreek,
        },
        languageTransforms: ancientGreekTransforms,
    },
    {
        // Hawaiian has no two-letter ISO code, so `iso` holds the three-letter one.
        iso: 'haw',
        iso639_3: 'haw',
        name: 'Hawaiian',
        exampleText: 'heluhelu',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'he',
        iso639_3: 'heb',
        name: 'Hebrew',
        exampleText: 'קריאה',
    },
    {
        iso: 'hi',
        iso639_3: 'hin',
        name: 'Hindi',
        exampleText: 'पढ़ने के लिए',
    },
    {
        iso: 'hu',
        iso639_3: 'hun',
        name: 'Hungarian',
        exampleText: 'olvasni',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'id',
        iso639_3: 'ind',
        name: 'Indonesian',
        exampleText: 'baca',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'it',
        iso639_3: 'ita',
        name: 'Italian',
        exampleText: 'leggere',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            removeApostrophedWords,
        },
    },
    {
        iso: 'la',
        iso639_3: 'lat',
        name: 'Latin',
        exampleText: 'legō',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            processDiphtongs,
        },
        languageTransforms: latinTransforms,
    },
    {
        iso: 'lo',
        iso639_3: 'lao',
        name: 'Lao',
        exampleText: 'ອ່ານ',
    },
    {
        iso: 'lv',
        iso639_3: 'lav',
        name: 'Latvian',
        exampleText: 'lasīt',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ja',
        iso639_3: 'jpn',
        name: 'Japanese',
        exampleText: '読め',
        isTextLookupWorthy: isStringPartiallyJapanese,
        textPreprocessors: {
            convertHalfWidthCharacters,
            alphabeticToHiragana,
            normalizeCombiningCharacters,
            normalizeCJKCompatibilityCharacters,
            normalizeRadicalCharacters,
            alphanumericWidthVariants,
            convertHiraganaToKatakana,
            collapseEmphaticSequences,
            standardizeKanji,
        },
        languageTransforms: japaneseTransforms,
    },
    {
        iso: 'ka',
        iso639_3: 'kat',
        name: 'Georgian',
        exampleText: 'კითხვა', // Georgian for “read”
        languageTransforms: georgianTransforms,
    },
    {
        iso: 'kn',
        iso639_3: 'kan',
        name: 'Kannada',
        exampleText: 'ಓದು',
    },
    {
        iso: 'km',
        iso639_3: 'khm',
        name: 'Khmer',
        exampleText: 'អាន',
    },
    {
        iso: 'ko',
        iso639_3: 'kor',
        name: 'Korean',
        exampleText: '읽어',
        textPreprocessors: {
            disassembleHangul,
        },
        textPostprocessors: {
            reassembleHangul,
        },
        languageTransforms: koreanTransforms,
    },
    {
        iso: 'mn',
        iso639_3: 'mon',
        name: 'Mongolian',
        exampleText: 'унших',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'mt',
        iso639_3: 'mlt',
        name: 'Maltese',
        exampleText: 'kiteb',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'nl',
        iso639_3: 'nld',
        name: 'Dutch',
        exampleText: 'lezen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'no',
        iso639_3: 'nor',
        name: 'Norwegian',
        exampleText: 'lese',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'pl',
        iso639_3: 'pol',
        name: 'Polish',
        exampleText: 'czytać',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'pt',
        iso639_3: 'por',
        name: 'Portuguese',
        exampleText: 'ler',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ro',
        iso639_3: 'ron',
        name: 'Romanian',
        exampleText: 'citi',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'ru',
        iso639_3: 'rus',
        name: 'Russian',
        exampleText: 'читать',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            yoToE,
            removeRussianDiacritics,
        },
    },
    {
        iso: 'sga',
        iso639_3: 'sga',
        name: 'Old Irish',
        exampleText: 'légaid',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: oldIrishTransforms,
    },
    {
        iso: 'sh',
        iso639_3: 'hbs',
        name: 'Serbo-Croatian',
        exampleText: 'čìtati',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeSerboCroatianAccentMarks,
            addSerboCroatianDiacritics,
        },
    },
    {
        iso: 'sq',
        iso639_3: 'sqi',
        name: 'Albanian',
        exampleText: 'ndihmoj', /* 'to help' */
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: albanianTransforms,
    },
    {
        iso: 'sv',
        iso639_3: 'swe',
        name: 'Swedish',
        exampleText: 'läsa',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'th',
        iso639_3: 'tha',
        name: 'Thai',
        exampleText: 'อ่าน',
    },
    {
        iso: 'tl',
        iso639_3: 'tgl',
        name: 'Tagalog',
        exampleText: 'basahin',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: tagalogTransforms,
    },
    {
        iso: 'tr',
        iso639_3: 'tur',
        name: 'Turkish',
        exampleText: 'okumak',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'tok',
        iso639_3: 'tok',
        name: 'Toki Pona',
        exampleText: 'wile',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'uk',
        iso639_3: 'ukr',
        name: 'Ukrainian',
        exampleText: 'читати',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'vi',
        iso639_3: 'vie',
        name: 'Vietnamese',
        exampleText: 'đọc',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            normalizeDiacritics,
        },
    },
    {
        iso: 'cy',
        iso639_3: 'cym',
        name: 'Welsh',
        exampleText: 'ddarllen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'yi',
        iso639_3: 'yid',
        name: 'Yiddish',
        exampleText: 'באַשאַפֿן',
        textPreprocessors: {
            removeYiddishDiacritics,
            combineYiddishLigatures,
        },
        textPostprocessors: {
            convertFinalLetters,
            convertYiddishLigatures,
        },
        languageTransforms: yiddishTransforms,
    },
    {
        iso: 'yue',
        iso639_3: 'yue',
        name: 'Cantonese',
        exampleText: '讀',
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
    {
        iso: 'zh',
        iso639_3: 'zho',
        name: 'Chinese',
        exampleText: '读',
        isTextLookupWorthy: isStringPartiallyChinese,
        readingNormalizer: normalizePinyin,
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
];

/**
 * Descriptor lookup keyed by each descriptor's `iso` field.
 * Entries are inserted in the same order as they appear in `languageDescriptors`.
 * @type {Map<string, import('language-descriptors').LanguageDescriptorAny>}
 */
export const languageDescriptorMap = new Map(
    languageDescriptors.map((descriptor) => [descriptor.iso, descriptor]),
);