oh no indexer :(
This commit is contained in:
parent
bdf158b964
commit
c12593b0bf
1 changed files with 76 additions and 45 deletions
121
index/main.go
121
index/main.go
|
@ -4,6 +4,7 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"git.tdpain.net/codemicro/hn84/index/internal/config"
|
"git.tdpain.net/codemicro/hn84/index/internal/config"
|
||||||
"git.tdpain.net/codemicro/hn84/index/internal/database"
|
"git.tdpain.net/codemicro/hn84/index/internal/database"
|
||||||
"git.tdpain.net/codemicro/hn84/util"
|
"git.tdpain.net/codemicro/hn84/util"
|
||||||
|
@ -14,6 +15,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -42,64 +44,93 @@ func walkDir(db *bun.DB, dir string) error {
|
||||||
return util.Wrap("read data dir", err)
|
return util.Wrap("read data dir", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, entry := range de {
|
fmt.Printf("%d entries in %s\n", len(de)/2, dir)
|
||||||
name := entry.Name()
|
defer fmt.Println()
|
||||||
if !strings.HasSuffix(name, "html") {
|
|
||||||
continue
|
jobs := make(chan os.DirEntry)
|
||||||
|
stop := new(sync.WaitGroup)
|
||||||
|
|
||||||
|
go worker(jobs, stop, db, dir)
|
||||||
|
go worker(jobs, stop, db, dir)
|
||||||
|
go worker(jobs, stop, db, dir)
|
||||||
|
go worker(jobs, stop, db, dir)
|
||||||
|
|
||||||
|
for i, entry := range de {
|
||||||
|
jobs <- entry
|
||||||
|
|
||||||
|
if i%100 == 0 {
|
||||||
|
fmt.Printf("%d %.0f%% \r", i, (float32(i)/float32(len(de)))*100)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
id := name[:len(name)-5]
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Process tokens
|
func worker(jobs chan os.DirEntry, done *sync.WaitGroup, db *bun.DB, dir string) {
|
||||||
|
done.Add(1)
|
||||||
htmlContent, err := os.ReadFile(path.Join(dir, name))
|
for entry := range jobs {
|
||||||
if err != nil {
|
if err := processEntry(db, entry, dir); err != nil {
|
||||||
return util.Wrap("read HTML file", err)
|
slog.Error("entry process error", err)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
done.Done()
|
||||||
|
}
|
||||||
|
|
||||||
plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
|
func processEntry(db *bun.DB, entry os.DirEntry, dir string) error {
|
||||||
if err != nil {
|
name := entry.Name()
|
||||||
return util.Wrap("convert HTML to plaintext", err)
|
if !strings.HasSuffix(name, "html") {
|
||||||
}
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
plaintext = filterPlaintextCharacters(plaintext)
|
id := name[:len(name)-5]
|
||||||
tokens := tokenise(plaintext)
|
|
||||||
tokens = filterStopwords(tokens)
|
|
||||||
stemTokens(tokens)
|
|
||||||
|
|
||||||
dbTokens := convertToDatabaseTokens(tokens, id)
|
// Process tokens
|
||||||
if _, err := db.NewInsert().Model(&dbTokens).Exec(context.Background()); err != nil {
|
|
||||||
return util.Wrap("unable to insert tokens to database", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dump plaintext to file
|
htmlContent, err := os.ReadFile(path.Join(dir, name))
|
||||||
if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0466); err != nil {
|
if err != nil {
|
||||||
return util.Wrap("write plaintext", err)
|
return util.Wrap("read HTML file", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read extra data
|
plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
|
||||||
var dat = struct {
|
if err != nil {
|
||||||
URL string
|
return util.Wrap("convert HTML to plaintext", err)
|
||||||
}{}
|
}
|
||||||
|
|
||||||
jsonBytes, err := os.ReadFile(path.Join(dir, id+".json"))
|
plaintext = filterPlaintextCharacters(plaintext)
|
||||||
if err != nil {
|
tokens := tokenise(plaintext)
|
||||||
return util.Wrap("read document info", err)
|
tokens = filterStopwords(tokens)
|
||||||
}
|
stemTokens(tokens)
|
||||||
|
|
||||||
if err := json.Unmarshal(jsonBytes, &dat); err != nil {
|
dbTokens := convertToDatabaseTokens(tokens, id)
|
||||||
return util.Wrap("unmarshal document info", err)
|
if _, err := db.NewInsert().Model(&dbTokens).Exec(context.Background()); err != nil {
|
||||||
}
|
return util.Wrap("unable to insert tokens to database", err)
|
||||||
|
}
|
||||||
|
|
||||||
if _, err := db.NewInsert().Model(&database.Document{
|
// Dump plaintext to file
|
||||||
ID: id,
|
if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0466); err != nil {
|
||||||
URL: dat.URL,
|
return util.Wrap("write plaintext", err)
|
||||||
Title: pageTitle,
|
}
|
||||||
}).Exec(context.Background()); err != nil {
|
|
||||||
return util.Wrap("insert document to database", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
break
|
// Read extra data
|
||||||
|
var dat = struct {
|
||||||
|
URL string
|
||||||
|
}{}
|
||||||
|
|
||||||
|
jsonBytes, err := os.ReadFile(path.Join(dir, id+".json"))
|
||||||
|
if err != nil {
|
||||||
|
return util.Wrap("read document info", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := json.Unmarshal(jsonBytes, &dat); err != nil {
|
||||||
|
return util.Wrap("unmarshal document info", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := db.NewInsert().Model(&database.Document{
|
||||||
|
ID: id,
|
||||||
|
URL: dat.URL,
|
||||||
|
Title: pageTitle,
|
||||||
|
}).Exec(context.Background()); err != nil {
|
||||||
|
return util.Wrap("insert document to database", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
|
Reference in a new issue