A privacy-first, self-hosted, fully open-source personal knowledge management application, written in TypeScript and Go. (PERSONAL FORK)
at lambda-fork/main 949 lines 26 kB view raw
1// SiYuan - Refactor your thinking 2// Copyright (c) 2020-present, b3log.org 3// 4// This program is free software: you can redistribute it and/or modify 5// it under the terms of the GNU Affero General Public License as published by 6// the Free Software Foundation, either version 3 of the License, or 7// (at your option) any later version. 8// 9// This program is distributed in the hope that it will be useful, 10// but WITHOUT ANY WARRANTY; without even the implied warranty of 11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12// GNU Affero General Public License for more details. 13// 14// You should have received a copy of the GNU Affero General Public License 15// along with this program. If not, see <https://www.gnu.org/licenses/>. 16 17package model 18 19import ( 20 "bytes" 21 "io/fs" 22 "os" 23 "path/filepath" 24 "runtime" 25 "strconv" 26 "strings" 27 "sync" 28 "time" 29 "unicode/utf8" 30 31 "code.sajari.com/docconv" 32 "github.com/88250/epub" 33 "github.com/88250/go-humanize" 34 "github.com/88250/gulu" 35 "github.com/88250/lute/ast" 36 "github.com/klippa-app/go-pdfium" 37 "github.com/klippa-app/go-pdfium/requests" 38 "github.com/klippa-app/go-pdfium/webassembly" 39 "github.com/siyuan-note/eventbus" 40 "github.com/siyuan-note/filelock" 41 "github.com/siyuan-note/logging" 42 "github.com/siyuan-note/siyuan/kernel/search" 43 "github.com/siyuan-note/siyuan/kernel/sql" 44 "github.com/siyuan-note/siyuan/kernel/task" 45 "github.com/siyuan-note/siyuan/kernel/util" 46 "github.com/xuri/excelize/v2" 47) 48 49type AssetContent struct { 50 ID string `json:"id"` 51 Name string `json:"name"` 52 Ext string `json:"ext"` 53 Path string `json:"path"` 54 Size int64 `json:"size"` 55 HSize string `json:"hSize"` 56 Updated int64 `json:"updated"` 57 Content string `json:"content"` 58} 59 60func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) { 61 if "" != query && (0 == queryMethod || 1 == queryMethod) { 62 if 0 == queryMethod { 63 query = 
stringQuery(query) 64 } 65 } 66 if !ast.IsNodeIDPattern(id) { 67 return 68 } 69 70 table := "asset_contents_fts_case_insensitive" 71 filter := " id = '" + id + "'" 72 if "" != query { 73 filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" 74 } 75 76 projections := "id, name, ext, path, size, updated, " + 77 "highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content" 78 stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter 79 assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1) 80 results := fromSQLAssetContents(&assetContents, 36) 81 if 1 > len(results) { 82 return 83 } 84 ret = results[0] 85 ret.Content = strings.ReplaceAll(ret.Content, "\n", "<br>") 86 return 87} 88 89// FullTextSearchAssetContent 搜索资源文件内容。 90// 91// method:0:关键字,1:查询语法,2:SQL,3:正则表达式 92// orderBy: 0:按相关度降序,1:按相关度升序,2:按更新时间升序,3:按更新时间降序 93func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) { 94 query = strings.TrimSpace(query) 95 beforeLen := 36 96 orderByClause := buildAssetContentOrderBy(orderBy) 97 switch method { 98 case 1: // 查询语法 99 filter := buildAssetContentTypeFilter(types) 100 ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize) 101 case 2: // SQL 102 ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize) 103 case 3: // 正则表达式 104 typeFilter := buildAssetContentTypeFilter(types) 105 ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize) 106 default: // 关键字 107 filter := buildAssetContentTypeFilter(types) 108 ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize) 109 } 110 pageCount = (matchedAssetCount + pageSize - 1) / pageSize 111 112 if 1 > 
len(ret) { 113 ret = []*AssetContent{} 114 } 115 return 116} 117 118func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { 119 query = filterQueryInvisibleChars(query) 120 return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize) 121} 122 123func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { 124 query = filterQueryInvisibleChars(query) 125 query = stringQuery(query) 126 return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize) 127} 128 129func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { 130 exp = filterQueryInvisibleChars(exp) 131 fieldFilter := assetContentFieldRegexp(exp) 132 stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter 133 stmt += " " + orderBy 134 stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize) 135 assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit) 136 ret = fromSQLAssetContents(&assetContents, beforeLen) 137 if 1 > len(ret) { 138 ret = []*AssetContent{} 139 } 140 141 matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter) 142 return 143} 144 145func assetContentFieldRegexp(exp string) string { 146 buf := bytes.Buffer{} 147 buf.WriteString("(name REGEXP '") 148 buf.WriteString(exp) 149 buf.WriteString("' OR content REGEXP '") 150 buf.WriteString(exp) 151 buf.WriteString("')") 152 return buf.String() 153} 154 155func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) { 156 table := "asset_contents_fts_case_insensitive" 157 fieldFilter := assetContentFieldRegexp(exp) 158 stmt := "SELECT 
COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter 159 result, _ := sql.QueryAssetContentNoLimit(stmt) 160 if 1 > len(result) { 161 return 162 } 163 matchedAssetCount = int(result[0]["assets"].(int64)) 164 return 165} 166 167func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { 168 table := "asset_contents_fts_case_insensitive" 169 projections := "id, name, ext, path, size, updated, " + 170 "snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content" 171 stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" 172 stmt += ") AND ext IN " + typeFilter 173 stmt += " " + orderBy 174 stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize) 175 assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize) 176 ret = fromSQLAssetContents(&assetContents, beforeLen) 177 if 1 > len(ret) { 178 ret = []*AssetContent{} 179 } 180 181 matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter) 182 return 183} 184 185func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { 186 stmt = filterQueryInvisibleChars(stmt) 187 stmt = strings.TrimSpace(stmt) 188 assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize) 189 ret = fromSQLAssetContents(&assetContents, beforeLen) 190 if 1 > len(ret) { 191 ret = []*AssetContent{} 192 return 193 } 194 195 stmt = strings.ToLower(stmt) 196 stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ") 197 stmt = removeLimitClause(stmt) 198 result, _ := sql.QueryAssetContentNoLimit(stmt) 199 if 1 > len(ret) { 200 return 201 } 202 203 matchedAssetCount = int(result[0]["assets"].(int64)) 204 return 205} 206 207func 
fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) { 208 query = filterQueryInvisibleChars(query) 209 210 table := "asset_contents_fts_case_insensitive" 211 stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" 212 stmt += ") AND ext IN " + typeFilter 213 result, _ := sql.QueryAssetContentNoLimit(stmt) 214 if 1 > len(result) { 215 return 216 } 217 matchedAssetCount = int(result[0]["assets"].(int64)) 218 return 219} 220 221func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) { 222 ret = []*AssetContent{} 223 for _, assetContent := range *assetContents { 224 ret = append(ret, fromSQLAssetContent(assetContent, beforeLen)) 225 } 226 return 227} 228 229func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent { 230 content := util.EscapeHTML(assetContent.Content) 231 if strings.Contains(content, search.SearchMarkLeft) { 232 content = strings.ReplaceAll(content, search.SearchMarkLeft, "<mark>") 233 content = strings.ReplaceAll(content, search.SearchMarkRight, "</mark>") 234 } 235 236 return &AssetContent{ 237 ID: assetContent.ID, 238 Name: assetContent.Name, 239 Ext: assetContent.Ext, 240 Path: assetContent.Path, 241 Size: assetContent.Size, 242 HSize: humanize.BytesCustomCeil(uint64(assetContent.Size), 2), 243 Updated: assetContent.Updated, 244 Content: content, 245 } 246} 247 248func buildAssetContentColumnFilter() string { 249 return "{name content}" 250} 251 252func buildAssetContentTypeFilter(types map[string]bool) string { 253 if 0 == len(types) { 254 return "" 255 } 256 257 var buf bytes.Buffer 258 buf.WriteString("(") 259 for k, enabled := range types { 260 if !enabled { 261 continue 262 } 263 264 buf.WriteString("'") 265 buf.WriteString(k) 266 buf.WriteString("',") 267 } 268 if 1 == buf.Len() { 269 buf.WriteString(")") 270 return buf.String() 271 } 272 273 
buf.Truncate(buf.Len() - 1) 274 buf.WriteString(")") 275 return buf.String() 276} 277 278func buildAssetContentOrderBy(orderBy int) string { 279 switch orderBy { 280 case 0: 281 return "ORDER BY rank DESC" 282 case 1: 283 return "ORDER BY rank ASC" 284 case 2: 285 return "ORDER BY updated ASC" 286 case 3: 287 return "ORDER BY updated DESC" 288 default: 289 return "ORDER BY rank DESC" 290 } 291} 292 293var assetContentSearcher = NewAssetsSearcher() 294 295func removeIndexAssetContent(absPath string) { 296 defer logging.Recover() 297 298 assetsDir := util.GetDataAssetsAbsPath() 299 p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) 300 sql.DeleteAssetContentsByPathQueue(p) 301} 302 303func indexAssetContent(absPath string) { 304 defer logging.Recover() 305 306 ext := filepath.Ext(absPath) 307 parser := assetContentSearcher.GetParser(ext) 308 if nil == parser { 309 return 310 } 311 312 result := parser.Parse(absPath) 313 if nil == result { 314 return 315 } 316 317 info, err := os.Stat(absPath) 318 if err != nil { 319 logging.LogErrorf("stat [%s] failed: %s", absPath, err) 320 return 321 } 322 323 assetsDir := util.GetDataAssetsAbsPath() 324 p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) 325 326 assetContents := []*sql.AssetContent{ 327 { 328 ID: ast.NewNodeID(), 329 Name: util.RemoveID(filepath.Base(p)), 330 Ext: ext, 331 Path: p, 332 Size: info.Size(), 333 Updated: info.ModTime().Unix(), 334 Content: result.Content, 335 }, 336 } 337 338 sql.DeleteAssetContentsByPathQueue(p) 339 sql.IndexAssetContentsQueue(assetContents) 340} 341 342func ReindexAssetContent() { 343 task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent) 344 return 345} 346 347func fullReindexAssetContent() { 348 util.PushMsg(Conf.Language(216), 7*1000) 349 sql.InitAssetContentDatabase(true) 350 351 assetContentSearcher.FullIndex() 352 return 353} 354 355func init() { 356 subscribeSQLAssetContentEvents() 357} 358 359func 
subscribeSQLAssetContentEvents() { 360 eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() { 361 ReindexAssetContent() 362 }) 363} 364 365var ( 366 AssetsSearchEnabled = true 367) 368 369type AssetsSearcher struct { 370 parsers map[string]AssetParser 371 lock *sync.Mutex 372} 373 374func (searcher *AssetsSearcher) GetParser(ext string) AssetParser { 375 searcher.lock.Lock() 376 defer searcher.lock.Unlock() 377 378 return searcher.parsers[strings.ToLower(ext)] 379} 380 381func (searcher *AssetsSearcher) FullIndex() { 382 defer logging.Recover() 383 384 assetsDir := util.GetDataAssetsAbsPath() 385 if !gulu.File.IsDir(assetsDir) { 386 return 387 } 388 389 var results []*AssetParseResult 390 filelock.Walk(assetsDir, func(absPath string, d fs.DirEntry, err error) error { 391 if err != nil { 392 logging.LogErrorf("walk dir [%s] failed: %s", absPath, err) 393 return err 394 } 395 396 if d.IsDir() { 397 return nil 398 } 399 400 ext := filepath.Ext(absPath) 401 parser := searcher.GetParser(ext) 402 if nil == parser { 403 return nil 404 } 405 406 logging.LogInfof("parsing asset content [%s]", absPath) 407 408 result := parser.Parse(absPath) 409 if nil == result { 410 return nil 411 } 412 413 info, err := d.Info() 414 if err != nil { 415 logging.LogErrorf("stat file [%s] failed: %s", absPath, err) 416 return nil 417 } 418 419 result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) 420 result.Size = info.Size() 421 result.Updated = info.ModTime().Unix() 422 results = append(results, result) 423 return nil 424 }) 425 426 var assetContents []*sql.AssetContent 427 for _, result := range results { 428 assetContents = append(assetContents, &sql.AssetContent{ 429 ID: ast.NewNodeID(), 430 Name: util.RemoveID(filepath.Base(result.Path)), 431 Ext: strings.ToLower(filepath.Ext(result.Path)), 432 Path: result.Path, 433 Size: result.Size, 434 Updated: result.Updated, 435 Content: result.Content, 436 }) 437 } 438 439 
sql.IndexAssetContentsQueue(assetContents) 440} 441 442func NewAssetsSearcher() *AssetsSearcher { 443 txtAssetParser := &TxtAssetParser{} 444 return &AssetsSearcher{ 445 parsers: map[string]AssetParser{ 446 ".txt": txtAssetParser, 447 ".md": txtAssetParser, 448 ".markdown": txtAssetParser, 449 ".json": txtAssetParser, 450 ".log": txtAssetParser, 451 ".sql": txtAssetParser, 452 ".html": txtAssetParser, 453 ".xml": txtAssetParser, 454 ".java": txtAssetParser, 455 ".h": txtAssetParser, 456 ".c": txtAssetParser, 457 ".cpp": txtAssetParser, 458 ".go": txtAssetParser, 459 ".rs": txtAssetParser, 460 ".swift": txtAssetParser, 461 ".kt": txtAssetParser, 462 ".py": txtAssetParser, 463 ".php": txtAssetParser, 464 ".js": txtAssetParser, 465 ".css": txtAssetParser, 466 ".ts": txtAssetParser, 467 ".sh": txtAssetParser, 468 ".bat": txtAssetParser, 469 ".cmd": txtAssetParser, 470 ".ini": txtAssetParser, 471 ".yaml": txtAssetParser, 472 ".rst": txtAssetParser, 473 ".adoc": txtAssetParser, 474 ".textile": txtAssetParser, 475 ".opml": txtAssetParser, 476 ".org": txtAssetParser, 477 ".wiki": txtAssetParser, 478 ".cs": txtAssetParser, 479 ".docx": &DocxAssetParser{}, 480 ".pptx": &PptxAssetParser{}, 481 ".xlsx": &XlsxAssetParser{}, 482 ".pdf": &PdfAssetParser{}, 483 ".epub": &EpubAssetParser{}, 484 }, 485 486 lock: &sync.Mutex{}, 487 } 488} 489 490const ( 491 TxtAssetContentMaxSize = 1024 * 1024 * 4 492 PDFAssetContentMaxPage = 1024 493) 494 495var ( 496 PDFAssetContentMaxSize uint64 = 1024 * 1024 * 128 497) 498 499type AssetParseResult struct { 500 Path string 501 Size int64 502 Updated int64 503 Content string 504} 505 506type AssetParser interface { 507 Parse(absPath string) *AssetParseResult 508} 509 510type TxtAssetParser struct { 511} 512 513func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { 514 info, err := os.Stat(absPath) 515 if err != nil { 516 logging.LogErrorf("stat file [%s] failed: %s", absPath, err) 517 return 518 } 519 520 if 
TxtAssetContentMaxSize < info.Size() { 521 logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.BytesCustomCeil(uint64(info.Size()), 2)) 522 return 523 } 524 525 tmp := copyTempAsset(absPath) 526 if "" == tmp { 527 return 528 } 529 defer os.RemoveAll(tmp) 530 531 data, err := os.ReadFile(tmp) 532 if err != nil { 533 logging.LogErrorf("read file [%s] failed: %s", absPath, err) 534 return 535 } 536 537 if !utf8.Valid(data) { 538 // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052 539 logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath) 540 return 541 } 542 543 content := string(data) 544 ret = &AssetParseResult{ 545 Content: content, 546 } 547 return 548} 549 550func normalizeNonTxtAssetContent(content string) (ret string) { 551 ret = strings.Join(strings.Fields(content), " ") 552 return 553} 554 555func copyTempAsset(absPath string) (ret string) { 556 dir := filepath.Join(util.TempDir, "convert", "asset_content") 557 if err := os.MkdirAll(dir, 0755); err != nil { 558 logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err) 559 return 560 } 561 562 baseName := filepath.Base(absPath) 563 if strings.HasPrefix(baseName, "~") { 564 return 565 } 566 567 filelock.Lock(absPath) 568 defer filelock.Unlock(absPath) 569 570 ext := filepath.Ext(absPath) 571 ret = filepath.Join(dir, gulu.Rand.String(7)+ext) 572 if err := gulu.File.Copy(absPath, ret); err != nil { 573 logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err) 574 return 575 } 576 return 577} 578 579type DocxAssetParser struct { 580} 581 582func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) { 583 if !strings.HasSuffix(strings.ToLower(absPath), ".docx") { 584 return 585 } 586 587 if !gulu.File.IsExist(absPath) { 588 return 589 } 590 591 tmp := copyTempAsset(absPath) 592 if "" == tmp { 593 return 594 } 595 defer os.RemoveAll(tmp) 596 597 f, err := os.Open(tmp) 598 if 
err != nil { 599 logging.LogErrorf("open [%s] failed: [%s]", tmp, err) 600 return 601 } 602 defer f.Close() 603 604 data, _, err := docconv.ConvertDocx(f) 605 if err != nil { 606 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 607 return 608 } 609 610 var content = normalizeNonTxtAssetContent(data) 611 ret = &AssetParseResult{ 612 Content: content, 613 } 614 return 615} 616 617type PptxAssetParser struct { 618} 619 620func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) { 621 if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") { 622 return 623 } 624 625 if !gulu.File.IsExist(absPath) { 626 return 627 } 628 629 tmp := copyTempAsset(absPath) 630 if "" == tmp { 631 return 632 } 633 defer os.RemoveAll(tmp) 634 635 f, err := os.Open(tmp) 636 if err != nil { 637 logging.LogErrorf("open [%s] failed: [%s]", tmp, err) 638 return 639 } 640 defer f.Close() 641 642 data, _, err := docconv.ConvertPptx(f) 643 if err != nil { 644 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 645 return 646 } 647 648 var content = normalizeNonTxtAssetContent(data) 649 ret = &AssetParseResult{ 650 Content: content, 651 } 652 return 653} 654 655type XlsxAssetParser struct { 656} 657 658func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) { 659 if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") { 660 return 661 } 662 663 if !gulu.File.IsExist(absPath) { 664 return 665 } 666 667 tmp := copyTempAsset(absPath) 668 if "" == tmp { 669 return 670 } 671 defer os.RemoveAll(tmp) 672 673 x, err := excelize.OpenFile(tmp) 674 if err != nil { 675 logging.LogErrorf("open [%s] failed: [%s]", tmp, err) 676 return 677 } 678 defer x.Close() 679 680 buf := bytes.Buffer{} 681 sheetMap := x.GetSheetMap() 682 for _, sheetName := range sheetMap { 683 rows, getErr := x.GetRows(sheetName) 684 if nil != getErr { 685 logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr) 686 return 687 } 688 for _, row := range rows { 689 for 
_, colCell := range row { 690 buf.WriteString(colCell + " ") 691 } 692 } 693 } 694 695 var content = normalizeNonTxtAssetContent(buf.String()) 696 ret = &AssetParseResult{ 697 Content: content, 698 } 699 return 700} 701 702// PdfAssetParser parser factory product 703type PdfAssetParser struct { 704} 705 706// pdfPage struct defines a worker job for text extraction 707type pdfPage struct { 708 pageNo int // page number for text extraction 709 data *[]byte // pointer to PDF document data 710} 711 712// pdfTextResult struct defines the extracted PDF text result 713type pdfTextResult struct { 714 pageNo int // page number of PDF document 715 text string // text of converted page 716 err error // processing error 717} 718 719// getTextPageWorker will extract the text from a given PDF page and return its result 720func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) { 721 defer instance.Close() 722 for pd := range page { 723 doc, err := instance.OpenDocument(&requests.OpenDocument{ 724 File: pd.data, 725 }) 726 if err != nil { 727 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ 728 Document: doc.Document, 729 }) 730 result <- &pdfTextResult{ 731 pageNo: pd.pageNo, 732 err: err, 733 } 734 continue 735 } 736 737 req := &requests.GetPageText{ 738 Page: requests.Page{ 739 ByIndex: &requests.PageByIndex{ 740 Document: doc.Document, 741 Index: pd.pageNo, 742 }, 743 }, 744 } 745 res, err := instance.GetPageText(req) 746 if err != nil { 747 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ 748 Document: doc.Document, 749 }) 750 result <- &pdfTextResult{ 751 pageNo: pd.pageNo, 752 err: err, 753 } 754 continue 755 } 756 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ 757 Document: doc.Document, 758 }) 759 result <- &pdfTextResult{ 760 pageNo: pd.pageNo, 761 text: res.Text, 762 err: nil, 763 } 764 } 765} 766 767// Parse will parse a PDF document using PDFium webassembly module 
using a worker pool 768func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { 769 if util.ContainerIOS == util.Container || util.ContainerAndroid == util.Container || util.ContainerHarmony == util.Container { 770 // PDF asset content searching is not supported on mobile platforms 771 return 772 } 773 774 now := time.Now() 775 if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") { 776 return 777 } 778 779 if !gulu.File.IsExist(absPath) { 780 return 781 } 782 783 tmp := copyTempAsset(absPath) 784 if "" == tmp { 785 return 786 } 787 defer os.RemoveAll(tmp) 788 789 // PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible 790 pdfData, err := os.ReadFile(tmp) 791 if err != nil { 792 logging.LogErrorf("open [%s] failed: [%s]", tmp, err) 793 return 794 } 795 796 // initialize go-pdfium with number of available cores 797 // we fire up the complete worker pool for maximum performance 798 cores := runtime.NumCPU() 799 if 4 < cores { 800 cores = 4 // Limit memory usage 801 } 802 803 pool, err := webassembly.Init(webassembly.Config{ 804 MinIdle: cores, 805 MaxIdle: cores, 806 MaxTotal: cores, 807 }) 808 if err != nil { 809 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 810 return 811 } 812 defer pool.Close() 813 814 // first get the number of PDF pages to convert into text 815 instance, err := pool.GetInstance(time.Second * 30) 816 if err != nil { 817 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 818 return 819 } 820 doc, err := instance.OpenDocument(&requests.OpenDocument{ 821 File: &pdfData, 822 }) 823 if err != nil { 824 instance.Close() 825 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 826 return 827 } 828 pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) 829 if err != nil { 830 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ 831 Document: doc.Document, 832 }) 833 instance.Close() 834 logging.LogErrorf("convert 
[%s] failed: [%s]", tmp, err) 835 return 836 } 837 instance.Close() 838 839 if PDFAssetContentMaxPage < pc.PageCount { 840 // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 841 logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount) 842 return 843 } 844 845 if maxSizeVal := os.Getenv("SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE"); "" != maxSizeVal { 846 if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr { 847 if maxSize != PDFAssetContentMaxSize { 848 PDFAssetContentMaxSize = maxSize 849 logging.LogInfof("set PDF asset content index max size to [%s]", humanize.BytesCustomCeil(maxSize, 2)) 850 } 851 } else { 852 logging.LogWarnf("invalid env [SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE]: [%s], parsing failed: %s", maxSizeVal, parseErr) 853 } 854 } 855 856 if PDFAssetContentMaxSize < uint64(len(pdfData)) { 857 // PDF files larger than 128MB are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9500 858 logging.LogWarnf("ignore large PDF asset [%s] with [%s]", absPath, humanize.BytesCustomCeil(uint64(len(pdfData)), 2)) 859 return 860 } 861 862 // next setup worker pool for processing PDF pages 863 pages := make(chan *pdfPage, pc.PageCount) 864 results := make(chan *pdfTextResult, pc.PageCount) 865 for i := 0; i < cores; i++ { 866 inst, err := pool.GetInstance(time.Second * 30) 867 if err != nil { 868 close(pages) 869 close(results) 870 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 871 return 872 } 873 go parser.getTextPageWorker(i, inst, pages, results) 874 } 875 876 // now split pages and let them process by worker pool 877 for p := 0; p < pc.PageCount; p++ { 878 pages <- &pdfPage{ 879 pageNo: p, 880 data: &pdfData, 881 } 882 } 883 close(pages) 884 885 // finally fetch the PDF page text results 886 // Note: some workers will process pages faster than other workers depending on the 
page contents 887 // the order of returned PDF text pages is random and must be sorted using the pageNo index 888 pageText := make([]string, pc.PageCount) 889 for p := 0; p < pc.PageCount; p++ { 890 res := <-results 891 pageText[res.pageNo] = res.text 892 if nil != res.err { 893 logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, res.err) 894 } 895 } 896 close(results) 897 898 if 128 < pc.PageCount { 899 logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now)) 900 } 901 902 // loop through ordered PDF text pages and join content for asset parse DB result 903 contentBuilder := bytes.Buffer{} 904 for _, pt := range pageText { 905 contentBuilder.WriteString(" " + normalizeNonTxtAssetContent(pt)) 906 } 907 ret = &AssetParseResult{ 908 Content: contentBuilder.String(), 909 } 910 return 911} 912 913type EpubAssetParser struct { 914} 915 916func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) { 917 if !strings.HasSuffix(strings.ToLower(absPath), ".epub") { 918 return 919 } 920 921 if !gulu.File.IsExist(absPath) { 922 return 923 } 924 925 tmp := copyTempAsset(absPath) 926 if "" == tmp { 927 return 928 } 929 defer os.RemoveAll(tmp) 930 931 f, err := os.Open(tmp) 932 if err != nil { 933 logging.LogErrorf("open [%s] failed: [%s]", tmp, err) 934 return 935 } 936 defer f.Close() 937 938 buf := bytes.Buffer{} 939 if err = epub.ToTxt(tmp, &buf); err != nil { 940 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) 941 return 942 } 943 944 content := normalizeNonTxtAssetContent(buf.String()) 945 ret = &AssetParseResult{ 946 Content: content, 947 } 948 return 949}