oh no indexer :(

This commit is contained in:
akp 2023-11-05 07:18:24 +00:00
parent bdf158b964
commit c12593b0bf
No known key found for this signature in database
GPG key ID: CF8D58F3DEB20755

View file

@ -4,6 +4,7 @@ import (
"bytes" "bytes"
"context" "context"
"encoding/json" "encoding/json"
"fmt"
"git.tdpain.net/codemicro/hn84/index/internal/config" "git.tdpain.net/codemicro/hn84/index/internal/config"
"git.tdpain.net/codemicro/hn84/index/internal/database" "git.tdpain.net/codemicro/hn84/index/internal/database"
"git.tdpain.net/codemicro/hn84/util" "git.tdpain.net/codemicro/hn84/util"
@ -14,6 +15,7 @@ import (
"os" "os"
"path" "path"
"strings" "strings"
"sync"
) )
func main() { func main() {
@ -42,64 +44,93 @@ func walkDir(db *bun.DB, dir string) error {
return util.Wrap("read data dir", err) return util.Wrap("read data dir", err)
} }
for _, entry := range de { fmt.Printf("%d entries in %s\n", len(de)/2, dir)
name := entry.Name() defer fmt.Println()
if !strings.HasSuffix(name, "html") {
continue jobs := make(chan os.DirEntry)
stop := new(sync.WaitGroup)
go worker(jobs, stop, db, dir)
go worker(jobs, stop, db, dir)
go worker(jobs, stop, db, dir)
go worker(jobs, stop, db, dir)
for i, entry := range de {
jobs <- entry
if i%100 == 0 {
fmt.Printf("%d %.0f%% \r", i, (float32(i)/float32(len(de)))*100)
} }
}
id := name[:len(name)-5] return nil
}
// Process tokens func worker(jobs chan os.DirEntry, done *sync.WaitGroup, db *bun.DB, dir string) {
done.Add(1)
htmlContent, err := os.ReadFile(path.Join(dir, name)) for entry := range jobs {
if err != nil { if err := processEntry(db, entry, dir); err != nil {
return util.Wrap("read HTML file", err) slog.Error("entry process error", err)
} }
}
done.Done()
}
plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent)) func processEntry(db *bun.DB, entry os.DirEntry, dir string) error {
if err != nil { name := entry.Name()
return util.Wrap("convert HTML to plaintext", err) if !strings.HasSuffix(name, "html") {
} return nil
}
plaintext = filterPlaintextCharacters(plaintext) id := name[:len(name)-5]
tokens := tokenise(plaintext)
tokens = filterStopwords(tokens)
stemTokens(tokens)
dbTokens := convertToDatabaseTokens(tokens, id) // Process tokens
if _, err := db.NewInsert().Model(&dbTokens).Exec(context.Background()); err != nil {
return util.Wrap("unable to insert tokens to database", err)
}
// Dump plaintext to file htmlContent, err := os.ReadFile(path.Join(dir, name))
if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0466); err != nil { if err != nil {
return util.Wrap("write plaintext", err) return util.Wrap("read HTML file", err)
} }
// Read extra data plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
var dat = struct { if err != nil {
URL string return util.Wrap("convert HTML to plaintext", err)
}{} }
jsonBytes, err := os.ReadFile(path.Join(dir, id+".json")) plaintext = filterPlaintextCharacters(plaintext)
if err != nil { tokens := tokenise(plaintext)
return util.Wrap("read document info", err) tokens = filterStopwords(tokens)
} stemTokens(tokens)
if err := json.Unmarshal(jsonBytes, &dat); err != nil { dbTokens := convertToDatabaseTokens(tokens, id)
return util.Wrap("unmarshal document info", err) if _, err := db.NewInsert().Model(&dbTokens).Exec(context.Background()); err != nil {
} return util.Wrap("unable to insert tokens to database", err)
}
if _, err := db.NewInsert().Model(&database.Document{ // Dump plaintext to file
ID: id, if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0466); err != nil {
URL: dat.URL, return util.Wrap("write plaintext", err)
Title: pageTitle, }
}).Exec(context.Background()); err != nil {
return util.Wrap("insert document to database", err)
}
break // Read extra data
var dat = struct {
URL string
}{}
jsonBytes, err := os.ReadFile(path.Join(dir, id+".json"))
if err != nil {
return util.Wrap("read document info", err)
}
if err := json.Unmarshal(jsonBytes, &dat); err != nil {
return util.Wrap("unmarshal document info", err)
}
if _, err := db.NewInsert().Model(&database.Document{
ID: id,
URL: dat.URL,
Title: pageTitle,
}).Exec(context.Background()); err != nil {
return util.Wrap("insert document to database", err)
} }
return nil return nil