A privacy-first, self-hosted, fully open source personal knowledge management software, written in typescript and golang. (PERSONAL FORK)
1// SiYuan - Refactor your thinking
2// Copyright (c) 2020-present, b3log.org
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17package model
18
19import (
20 "bytes"
21 "io/fs"
22 "os"
23 "path/filepath"
24 "runtime"
25 "strconv"
26 "strings"
27 "sync"
28 "time"
29 "unicode/utf8"
30
31 "code.sajari.com/docconv"
32 "github.com/88250/epub"
33 "github.com/88250/go-humanize"
34 "github.com/88250/gulu"
35 "github.com/88250/lute/ast"
36 "github.com/klippa-app/go-pdfium"
37 "github.com/klippa-app/go-pdfium/requests"
38 "github.com/klippa-app/go-pdfium/webassembly"
39 "github.com/siyuan-note/eventbus"
40 "github.com/siyuan-note/filelock"
41 "github.com/siyuan-note/logging"
42 "github.com/siyuan-note/siyuan/kernel/search"
43 "github.com/siyuan-note/siyuan/kernel/sql"
44 "github.com/siyuan-note/siyuan/kernel/task"
45 "github.com/siyuan-note/siyuan/kernel/util"
46 "github.com/xuri/excelize/v2"
47)
48
// AssetContent is the search-result representation of an indexed asset file.
// It is built from a sql.AssetContent record and carries JSON tags for
// serialization.
type AssetContent struct {
	ID      string `json:"id"`      // Identifier of the index record
	Name    string `json:"name"`    // Asset file name
	Ext     string `json:"ext"`     // File extension
	Path    string `json:"path"`    // Asset path
	Size    int64  `json:"size"`    // File size
	HSize   string `json:"hSize"`   // Human-readable rendering of Size
	Updated int64  `json:"updated"` // Last update time
	Content string `json:"content"` // HTML-escaped content; search marks are rendered as <mark> tags
}
59
60func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) {
61 if "" != query && (0 == queryMethod || 1 == queryMethod) {
62 if 0 == queryMethod {
63 query = stringQuery(query)
64 }
65 }
66 if !ast.IsNodeIDPattern(id) {
67 return
68 }
69
70 table := "asset_contents_fts_case_insensitive"
71 filter := " id = '" + id + "'"
72 if "" != query {
73 filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
74 }
75
76 projections := "id, name, ext, path, size, updated, " +
77 "highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content"
78 stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter
79 assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1)
80 results := fromSQLAssetContents(&assetContents, 36)
81 if 1 > len(results) {
82 return
83 }
84 ret = results[0]
85 ret.Content = strings.ReplaceAll(ret.Content, "\n", "<br>")
86 return
87}
88
89// FullTextSearchAssetContent 搜索资源文件内容。
90//
91// method:0:关键字,1:查询语法,2:SQL,3:正则表达式
92// orderBy: 0:按相关度降序,1:按相关度升序,2:按更新时间升序,3:按更新时间降序
93func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) {
94 query = strings.TrimSpace(query)
95 beforeLen := 36
96 orderByClause := buildAssetContentOrderBy(orderBy)
97 switch method {
98 case 1: // 查询语法
99 filter := buildAssetContentTypeFilter(types)
100 ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize)
101 case 2: // SQL
102 ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize)
103 case 3: // 正则表达式
104 typeFilter := buildAssetContentTypeFilter(types)
105 ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize)
106 default: // 关键字
107 filter := buildAssetContentTypeFilter(types)
108 ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize)
109 }
110 pageCount = (matchedAssetCount + pageSize - 1) / pageSize
111
112 if 1 > len(ret) {
113 ret = []*AssetContent{}
114 }
115 return
116}
117
118func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
119 query = filterQueryInvisibleChars(query)
120 return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
121}
122
123func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
124 query = filterQueryInvisibleChars(query)
125 query = stringQuery(query)
126 return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
127}
128
129func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
130 exp = filterQueryInvisibleChars(exp)
131 fieldFilter := assetContentFieldRegexp(exp)
132 stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter
133 stmt += " " + orderBy
134 stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
135 assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit)
136 ret = fromSQLAssetContents(&assetContents, beforeLen)
137 if 1 > len(ret) {
138 ret = []*AssetContent{}
139 }
140
141 matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter)
142 return
143}
144
// assetContentFieldRegexp builds the SQL condition that matches exp against
// the name or the content column using REGEXP.
//
// exp is embedded in single-quoted SQL string literals, so single quotes in it
// are doubled; otherwise an expression containing a quote would terminate the
// literal and break (or inject into) the statement. Double quotes need no
// escaping because the literals are delimited by single quotes.
func assetContentFieldRegexp(exp string) string {
	exp = strings.ReplaceAll(exp, "'", "''")
	return "(name REGEXP '" + exp + "' OR content REGEXP '" + exp + "')"
}
154
155func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) {
156 table := "asset_contents_fts_case_insensitive"
157 fieldFilter := assetContentFieldRegexp(exp)
158 stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter
159 result, _ := sql.QueryAssetContentNoLimit(stmt)
160 if 1 > len(result) {
161 return
162 }
163 matchedAssetCount = int(result[0]["assets"].(int64))
164 return
165}
166
167func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
168 table := "asset_contents_fts_case_insensitive"
169 projections := "id, name, ext, path, size, updated, " +
170 "snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content"
171 stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
172 stmt += ") AND ext IN " + typeFilter
173 stmt += " " + orderBy
174 stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
175 assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
176 ret = fromSQLAssetContents(&assetContents, beforeLen)
177 if 1 > len(ret) {
178 ret = []*AssetContent{}
179 }
180
181 matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter)
182 return
183}
184
185func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
186 stmt = filterQueryInvisibleChars(stmt)
187 stmt = strings.TrimSpace(stmt)
188 assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
189 ret = fromSQLAssetContents(&assetContents, beforeLen)
190 if 1 > len(ret) {
191 ret = []*AssetContent{}
192 return
193 }
194
195 stmt = strings.ToLower(stmt)
196 stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ")
197 stmt = removeLimitClause(stmt)
198 result, _ := sql.QueryAssetContentNoLimit(stmt)
199 if 1 > len(ret) {
200 return
201 }
202
203 matchedAssetCount = int(result[0]["assets"].(int64))
204 return
205}
206
207func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) {
208 query = filterQueryInvisibleChars(query)
209
210 table := "asset_contents_fts_case_insensitive"
211 stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
212 stmt += ") AND ext IN " + typeFilter
213 result, _ := sql.QueryAssetContentNoLimit(stmt)
214 if 1 > len(result) {
215 return
216 }
217 matchedAssetCount = int(result[0]["assets"].(int64))
218 return
219}
220
221func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) {
222 ret = []*AssetContent{}
223 for _, assetContent := range *assetContents {
224 ret = append(ret, fromSQLAssetContent(assetContent, beforeLen))
225 }
226 return
227}
228
229func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent {
230 content := util.EscapeHTML(assetContent.Content)
231 if strings.Contains(content, search.SearchMarkLeft) {
232 content = strings.ReplaceAll(content, search.SearchMarkLeft, "<mark>")
233 content = strings.ReplaceAll(content, search.SearchMarkRight, "</mark>")
234 }
235
236 return &AssetContent{
237 ID: assetContent.ID,
238 Name: assetContent.Name,
239 Ext: assetContent.Ext,
240 Path: assetContent.Path,
241 Size: assetContent.Size,
242 HSize: humanize.BytesCustomCeil(uint64(assetContent.Size), 2),
243 Updated: assetContent.Updated,
244 Content: content,
245 }
246}
247
// buildAssetContentColumnFilter returns the column filter placed in front of
// a MATCH query so that only the name and content columns are searched.
func buildAssetContentColumnFilter() string {
	const searchableColumns = "{name content}"
	return searchableColumns
}
251
// buildAssetContentTypeFilter renders the enabled asset types as a
// parenthesized SQL list of quoted strings, e.g. ('.pdf','.txt'), for use
// after "ext IN".
//
// It returns "" when types is empty and "()" when no type is enabled. The
// order of the entries follows map iteration and is therefore not stable.
// Single quotes inside a key are doubled so a key cannot terminate the SQL
// string literal it is embedded in.
func buildAssetContentTypeFilter(types map[string]bool) string {
	if 0 == len(types) {
		return ""
	}

	quoted := make([]string, 0, len(types))
	for ext, enabled := range types {
		if !enabled {
			continue
		}
		quoted = append(quoted, "'"+strings.ReplaceAll(ext, "'", "''")+"'")
	}
	return "(" + strings.Join(quoted, ",") + ")"
}
277
// buildAssetContentOrderBy maps the numeric sort mode to its ORDER BY clause:
// 0 rank descending, 1 rank ascending, 2 updated ascending, 3 updated
// descending. Any other value falls back to the clause of mode 0.
func buildAssetContentOrderBy(orderBy int) string {
	clauses := [...]string{
		"ORDER BY rank DESC",
		"ORDER BY rank ASC",
		"ORDER BY updated ASC",
		"ORDER BY updated DESC",
	}
	if 0 <= orderBy && orderBy < len(clauses) {
		return clauses[orderBy]
	}
	return clauses[0]
}
292
// assetContentSearcher is the package-level searcher that supplies the parser
// for an asset's extension and performs full reindexing of asset content.
var assetContentSearcher = NewAssetsSearcher()
294
295func removeIndexAssetContent(absPath string) {
296 defer logging.Recover()
297
298 assetsDir := util.GetDataAssetsAbsPath()
299 p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
300 sql.DeleteAssetContentsByPathQueue(p)
301}
302
303func indexAssetContent(absPath string) {
304 defer logging.Recover()
305
306 ext := filepath.Ext(absPath)
307 parser := assetContentSearcher.GetParser(ext)
308 if nil == parser {
309 return
310 }
311
312 result := parser.Parse(absPath)
313 if nil == result {
314 return
315 }
316
317 info, err := os.Stat(absPath)
318 if err != nil {
319 logging.LogErrorf("stat [%s] failed: %s", absPath, err)
320 return
321 }
322
323 assetsDir := util.GetDataAssetsAbsPath()
324 p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
325
326 assetContents := []*sql.AssetContent{
327 {
328 ID: ast.NewNodeID(),
329 Name: util.RemoveID(filepath.Base(p)),
330 Ext: ext,
331 Path: p,
332 Size: info.Size(),
333 Updated: info.ModTime().Unix(),
334 Content: result.Content,
335 },
336 }
337
338 sql.DeleteAssetContentsByPathQueue(p)
339 sql.IndexAssetContentsQueue(assetContents)
340}
341
342func ReindexAssetContent() {
343 task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent)
344 return
345}
346
347func fullReindexAssetContent() {
348 util.PushMsg(Conf.Language(216), 7*1000)
349 sql.InitAssetContentDatabase(true)
350
351 assetContentSearcher.FullIndex()
352 return
353}
354
// init registers the asset content event subscriptions when the package is
// initialized.
func init() {
	subscribeSQLAssetContentEvents()
}
358
359func subscribeSQLAssetContentEvents() {
360 eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() {
361 ReindexAssetContent()
362 })
363}
364
var (
	// AssetsSearchEnabled defaults to true.
	// NOTE(review): it is not read anywhere in this file; presumably it is
	// consulted elsewhere to switch asset content search on or off - confirm.
	AssetsSearchEnabled = true
)
368
// AssetsSearcher holds the registry of asset content parsers and drives full
// indexing of the assets directory.
type AssetsSearcher struct {
	parsers map[string]AssetParser // Parsers keyed by lower-case file extension including the dot
	lock    *sync.Mutex            // Held while the parsers map is read in GetParser
}
373
374func (searcher *AssetsSearcher) GetParser(ext string) AssetParser {
375 searcher.lock.Lock()
376 defer searcher.lock.Unlock()
377
378 return searcher.parsers[strings.ToLower(ext)]
379}
380
381func (searcher *AssetsSearcher) FullIndex() {
382 defer logging.Recover()
383
384 assetsDir := util.GetDataAssetsAbsPath()
385 if !gulu.File.IsDir(assetsDir) {
386 return
387 }
388
389 var results []*AssetParseResult
390 filelock.Walk(assetsDir, func(absPath string, d fs.DirEntry, err error) error {
391 if err != nil {
392 logging.LogErrorf("walk dir [%s] failed: %s", absPath, err)
393 return err
394 }
395
396 if d.IsDir() {
397 return nil
398 }
399
400 ext := filepath.Ext(absPath)
401 parser := searcher.GetParser(ext)
402 if nil == parser {
403 return nil
404 }
405
406 logging.LogInfof("parsing asset content [%s]", absPath)
407
408 result := parser.Parse(absPath)
409 if nil == result {
410 return nil
411 }
412
413 info, err := d.Info()
414 if err != nil {
415 logging.LogErrorf("stat file [%s] failed: %s", absPath, err)
416 return nil
417 }
418
419 result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
420 result.Size = info.Size()
421 result.Updated = info.ModTime().Unix()
422 results = append(results, result)
423 return nil
424 })
425
426 var assetContents []*sql.AssetContent
427 for _, result := range results {
428 assetContents = append(assetContents, &sql.AssetContent{
429 ID: ast.NewNodeID(),
430 Name: util.RemoveID(filepath.Base(result.Path)),
431 Ext: strings.ToLower(filepath.Ext(result.Path)),
432 Path: result.Path,
433 Size: result.Size,
434 Updated: result.Updated,
435 Content: result.Content,
436 })
437 }
438
439 sql.IndexAssetContentsQueue(assetContents)
440}
441
442func NewAssetsSearcher() *AssetsSearcher {
443 txtAssetParser := &TxtAssetParser{}
444 return &AssetsSearcher{
445 parsers: map[string]AssetParser{
446 ".txt": txtAssetParser,
447 ".md": txtAssetParser,
448 ".markdown": txtAssetParser,
449 ".json": txtAssetParser,
450 ".log": txtAssetParser,
451 ".sql": txtAssetParser,
452 ".html": txtAssetParser,
453 ".xml": txtAssetParser,
454 ".java": txtAssetParser,
455 ".h": txtAssetParser,
456 ".c": txtAssetParser,
457 ".cpp": txtAssetParser,
458 ".go": txtAssetParser,
459 ".rs": txtAssetParser,
460 ".swift": txtAssetParser,
461 ".kt": txtAssetParser,
462 ".py": txtAssetParser,
463 ".php": txtAssetParser,
464 ".js": txtAssetParser,
465 ".css": txtAssetParser,
466 ".ts": txtAssetParser,
467 ".sh": txtAssetParser,
468 ".bat": txtAssetParser,
469 ".cmd": txtAssetParser,
470 ".ini": txtAssetParser,
471 ".yaml": txtAssetParser,
472 ".rst": txtAssetParser,
473 ".adoc": txtAssetParser,
474 ".textile": txtAssetParser,
475 ".opml": txtAssetParser,
476 ".org": txtAssetParser,
477 ".wiki": txtAssetParser,
478 ".cs": txtAssetParser,
479 ".docx": &DocxAssetParser{},
480 ".pptx": &PptxAssetParser{},
481 ".xlsx": &XlsxAssetParser{},
482 ".pdf": &PdfAssetParser{},
483 ".epub": &EpubAssetParser{},
484 },
485
486 lock: &sync.Mutex{},
487 }
488}
489
const (
	// TxtAssetContentMaxSize is the largest text asset, in bytes (4 MiB),
	// that TxtAssetParser will parse; larger files are skipped.
	TxtAssetContentMaxSize = 1024 * 1024 * 4
	// PDFAssetContentMaxPage is the largest page count of a PDF that
	// PdfAssetParser will parse; longer documents are skipped.
	PDFAssetContentMaxPage = 1024
)

var (
	// PDFAssetContentMaxSize is the largest PDF, in bytes (128 MiB by
	// default), that PdfAssetParser will parse. PdfAssetParser.Parse
	// overwrites it from the environment variable
	// SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE when that holds a valid number.
	PDFAssetContentMaxSize uint64 = 1024 * 1024 * 128
)
498
// AssetParseResult is the outcome of parsing one asset file. Parsers fill in
// Content only; FullIndex fills in the remaining fields afterwards.
type AssetParseResult struct {
	Path    string // Index path of the asset ("assets" + slash-separated relative path)
	Size    int64  // File size as reported by the file info
	Updated int64  // Modification time as a Unix timestamp
	Content string // Extracted text content
}
505
// AssetParser extracts the searchable text of an asset file.
type AssetParser interface {
	// Parse extracts the content of the file at absPath. The parsers in this
	// file return nil when the file is skipped or cannot be parsed.
	Parse(absPath string) *AssetParseResult
}
509
510type TxtAssetParser struct {
511}
512
513func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
514 info, err := os.Stat(absPath)
515 if err != nil {
516 logging.LogErrorf("stat file [%s] failed: %s", absPath, err)
517 return
518 }
519
520 if TxtAssetContentMaxSize < info.Size() {
521 logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.BytesCustomCeil(uint64(info.Size()), 2))
522 return
523 }
524
525 tmp := copyTempAsset(absPath)
526 if "" == tmp {
527 return
528 }
529 defer os.RemoveAll(tmp)
530
531 data, err := os.ReadFile(tmp)
532 if err != nil {
533 logging.LogErrorf("read file [%s] failed: %s", absPath, err)
534 return
535 }
536
537 if !utf8.Valid(data) {
538 // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
539 logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
540 return
541 }
542
543 content := string(data)
544 ret = &AssetParseResult{
545 Content: content,
546 }
547 return
548}
549
// normalizeNonTxtAssetContent collapses every run of whitespace in content
// into a single space and removes leading and trailing whitespace.
func normalizeNonTxtAssetContent(content string) (ret string) {
	words := strings.Fields(content)
	return strings.Join(words, " ")
}
554
555func copyTempAsset(absPath string) (ret string) {
556 dir := filepath.Join(util.TempDir, "convert", "asset_content")
557 if err := os.MkdirAll(dir, 0755); err != nil {
558 logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err)
559 return
560 }
561
562 baseName := filepath.Base(absPath)
563 if strings.HasPrefix(baseName, "~") {
564 return
565 }
566
567 filelock.Lock(absPath)
568 defer filelock.Unlock(absPath)
569
570 ext := filepath.Ext(absPath)
571 ret = filepath.Join(dir, gulu.Rand.String(7)+ext)
572 if err := gulu.File.Copy(absPath, ret); err != nil {
573 logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err)
574 return
575 }
576 return
577}
578
579type DocxAssetParser struct {
580}
581
582func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
583 if !strings.HasSuffix(strings.ToLower(absPath), ".docx") {
584 return
585 }
586
587 if !gulu.File.IsExist(absPath) {
588 return
589 }
590
591 tmp := copyTempAsset(absPath)
592 if "" == tmp {
593 return
594 }
595 defer os.RemoveAll(tmp)
596
597 f, err := os.Open(tmp)
598 if err != nil {
599 logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
600 return
601 }
602 defer f.Close()
603
604 data, _, err := docconv.ConvertDocx(f)
605 if err != nil {
606 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
607 return
608 }
609
610 var content = normalizeNonTxtAssetContent(data)
611 ret = &AssetParseResult{
612 Content: content,
613 }
614 return
615}
616
617type PptxAssetParser struct {
618}
619
620func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
621 if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") {
622 return
623 }
624
625 if !gulu.File.IsExist(absPath) {
626 return
627 }
628
629 tmp := copyTempAsset(absPath)
630 if "" == tmp {
631 return
632 }
633 defer os.RemoveAll(tmp)
634
635 f, err := os.Open(tmp)
636 if err != nil {
637 logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
638 return
639 }
640 defer f.Close()
641
642 data, _, err := docconv.ConvertPptx(f)
643 if err != nil {
644 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
645 return
646 }
647
648 var content = normalizeNonTxtAssetContent(data)
649 ret = &AssetParseResult{
650 Content: content,
651 }
652 return
653}
654
655type XlsxAssetParser struct {
656}
657
658func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
659 if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") {
660 return
661 }
662
663 if !gulu.File.IsExist(absPath) {
664 return
665 }
666
667 tmp := copyTempAsset(absPath)
668 if "" == tmp {
669 return
670 }
671 defer os.RemoveAll(tmp)
672
673 x, err := excelize.OpenFile(tmp)
674 if err != nil {
675 logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
676 return
677 }
678 defer x.Close()
679
680 buf := bytes.Buffer{}
681 sheetMap := x.GetSheetMap()
682 for _, sheetName := range sheetMap {
683 rows, getErr := x.GetRows(sheetName)
684 if nil != getErr {
685 logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr)
686 return
687 }
688 for _, row := range rows {
689 for _, colCell := range row {
690 buf.WriteString(colCell + " ")
691 }
692 }
693 }
694
695 var content = normalizeNonTxtAssetContent(buf.String())
696 ret = &AssetParseResult{
697 Content: content,
698 }
699 return
700}
701
// PdfAssetParser parses PDF assets. It is the parser registered for the .pdf
// extension and extracts page text with go-pdfium.
type PdfAssetParser struct {
}
705
// pdfPage is a worker job: extract the text of one page of a PDF document.
type pdfPage struct {
	pageNo int     // zero-based index of the page to extract
	data   *[]byte // pointer to the in-memory PDF document data shared by all jobs
}
711
// pdfTextResult is the outcome of a worker job for one PDF page.
type pdfTextResult struct {
	pageNo int    // zero-based index of the page the result belongs to
	text   string // extracted text of the page; empty when err is set
	err    error  // error raised while opening the document or extracting the text
}
718
719// getTextPageWorker will extract the text from a given PDF page and return its result
720func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
721 defer instance.Close()
722 for pd := range page {
723 doc, err := instance.OpenDocument(&requests.OpenDocument{
724 File: pd.data,
725 })
726 if err != nil {
727 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
728 Document: doc.Document,
729 })
730 result <- &pdfTextResult{
731 pageNo: pd.pageNo,
732 err: err,
733 }
734 continue
735 }
736
737 req := &requests.GetPageText{
738 Page: requests.Page{
739 ByIndex: &requests.PageByIndex{
740 Document: doc.Document,
741 Index: pd.pageNo,
742 },
743 },
744 }
745 res, err := instance.GetPageText(req)
746 if err != nil {
747 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
748 Document: doc.Document,
749 })
750 result <- &pdfTextResult{
751 pageNo: pd.pageNo,
752 err: err,
753 }
754 continue
755 }
756 instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
757 Document: doc.Document,
758 })
759 result <- &pdfTextResult{
760 pageNo: pd.pageNo,
761 text: res.Text,
762 err: nil,
763 }
764 }
765}
766
// Parse extracts the text of the PDF document at absPath using the PDFium
// webassembly module and a pool of worker goroutines, one page per job.
//
// It returns nil on mobile containers, when absPath is not an existing .pdf
// file, when the document has more than PDFAssetContentMaxPage pages or is
// larger than PDFAssetContentMaxSize bytes, or when pdfium cannot be
// initialized or the document cannot be opened. Errors of individual pages
// are logged and the affected page contributes no text. The document is read
// from a temporary copy that is removed afterwards.
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
	if util.ContainerIOS == util.Container || util.ContainerAndroid == util.Container || util.ContainerHarmony == util.Container {
		// PDF asset content searching is not supported on mobile platforms
		return
	}

	// Start time, used for the duration log of large documents below
	now := time.Now()
	if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
		return
	}

	if !gulu.File.IsExist(absPath) {
		return
	}

	tmp := copyTempAsset(absPath)
	if "" == tmp {
		return
	}
	defer os.RemoveAll(tmp)

	// PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible
	pdfData, err := os.ReadFile(tmp)
	if err != nil {
		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
		return
	}

	// initialize go-pdfium with number of available cores
	// we fire up the complete worker pool for maximum performance
	cores := runtime.NumCPU()
	if 4 < cores {
		cores = 4 // Limit memory usage
	}

	pool, err := webassembly.Init(webassembly.Config{
		MinIdle:  cores,
		MaxIdle:  cores,
		MaxTotal: cores,
	})
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	defer pool.Close()

	// first get the number of PDF pages to convert into text
	instance, err := pool.GetInstance(time.Second * 30)
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	doc, err := instance.OpenDocument(&requests.OpenDocument{
		File: &pdfData,
	})
	if err != nil {
		instance.Close()
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
	if err != nil {
		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
			Document: doc.Document,
		})
		instance.Close()
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	// The counting instance is closed before the workers request their own
	// instances from the pool, which is limited to `cores` instances in total
	instance.Close()

	if PDFAssetContentMaxPage < pc.PageCount {
		// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
		logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
		return
	}

	// The size limit can be overridden through an environment variable; the
	// override is written to the package-level PDFAssetContentMaxSize
	if maxSizeVal := os.Getenv("SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE"); "" != maxSizeVal {
		if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr {
			if maxSize != PDFAssetContentMaxSize {
				PDFAssetContentMaxSize = maxSize
				logging.LogInfof("set PDF asset content index max size to [%s]", humanize.BytesCustomCeil(maxSize, 2))
			}
		} else {
			logging.LogWarnf("invalid env [SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE]: [%s], parsing failed: %s", maxSizeVal, parseErr)
		}
	}

	if PDFAssetContentMaxSize < uint64(len(pdfData)) {
		// PDF files larger than 128MB are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9500
		logging.LogWarnf("ignore large PDF asset [%s] with [%s]", absPath, humanize.BytesCustomCeil(uint64(len(pdfData)), 2))
		return
	}

	// next setup worker pool for processing PDF pages
	// both channels are buffered with the page count, so neither sending all
	// jobs nor sending all results can block
	pages := make(chan *pdfPage, pc.PageCount)
	results := make(chan *pdfTextResult, pc.PageCount)
	for i := 0; i < cores; i++ {
		inst, err := pool.GetInstance(time.Second * 30)
		if err != nil {
			// closing pages lets the workers started so far leave their loop;
			// no job has been queued yet, so no result will be sent
			close(pages)
			close(results)
			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
			return
		}
		go parser.getTextPageWorker(i, inst, pages, results)
	}

	// now split pages and let them process by worker pool
	for p := 0; p < pc.PageCount; p++ {
		pages <- &pdfPage{
			pageNo: p,
			data:   &pdfData,
		}
	}
	close(pages)

	// finally fetch the PDF page text results
	// Note: some workers will process pages faster than other workers depending on the page contents
	// the order of returned PDF text pages is random and must be sorted using the pageNo index
	pageText := make([]string, pc.PageCount)
	for p := 0; p < pc.PageCount; p++ {
		res := <-results
		pageText[res.pageNo] = res.text
		if nil != res.err {
			logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, res.err)
		}
	}
	close(results)

	// Only the conversion time of documents with more than 128 pages is logged
	if 128 < pc.PageCount {
		logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
	}

	// loop through ordered PDF text pages and join content for asset parse DB result
	contentBuilder := bytes.Buffer{}
	for _, pt := range pageText {
		contentBuilder.WriteString(" " + normalizeNonTxtAssetContent(pt))
	}
	ret = &AssetParseResult{
		Content: contentBuilder.String(),
	}
	return
}
912
913type EpubAssetParser struct {
914}
915
916func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) {
917 if !strings.HasSuffix(strings.ToLower(absPath), ".epub") {
918 return
919 }
920
921 if !gulu.File.IsExist(absPath) {
922 return
923 }
924
925 tmp := copyTempAsset(absPath)
926 if "" == tmp {
927 return
928 }
929 defer os.RemoveAll(tmp)
930
931 f, err := os.Open(tmp)
932 if err != nil {
933 logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
934 return
935 }
936 defer f.Close()
937
938 buf := bytes.Buffer{}
939 if err = epub.ToTxt(tmp, &buf); err != nil {
940 logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
941 return
942 }
943
944 content := normalizeNonTxtAssetContent(buf.String())
945 ret = &AssetParseResult{
946 Content: content,
947 }
948 return
949}