I wish to scream (yay indexer)

This commit is contained in:
akp 2023-11-05 02:15:05 +00:00
parent 2ee0404ee8
commit 22671ea8cd
No known key found for this signature in database
GPG key ID: CF8D58F3DEB20755
6 changed files with 351 additions and 23 deletions

View file

@ -34,7 +34,7 @@ func Get() *Config {
NumWorkers: cl.WithDefault("crawler.numWorkers", 8).AsInt(),
MaxPagesPerDomain: cl.WithDefault("crawler.maxPagesPerDomain", 300).AsInt(),
UserAgent: cl.WithDefault("crawler.userAgent", "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0").AsString(),
CrawlDataDir: cl.WithDefault("crawler.dataDir", "crawlData").AsString(),
CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
}
})

24
go.mod
View file

@ -3,20 +3,22 @@ module git.tdpain.net/codemicro/hn84
go 1.21.1
require (
git.tdpain.net/pkg/cfger v0.1.0 // indirect
github.com/PuerkitoBio/goquery v1.8.1 // indirect
git.tdpain.net/pkg/cfger v0.1.0
github.com/PuerkitoBio/goquery v1.8.1
github.com/bwmarrin/snowflake v0.3.0
github.com/carlmjohnson/requests v0.23.5
github.com/mattn/go-sqlite3 v1.14.17
github.com/uptrace/bun v1.1.16
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8
)
require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/bwmarrin/snowflake v0.3.0 // indirect
github.com/carlmjohnson/requests v0.23.5 // indirect
github.com/fatih/color v1.15.0 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/mattn/go-sqlite3 v1.14.17 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c // indirect
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc // indirect
github.com/uptrace/bun v1.1.16 // indirect
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 // indirect
github.com/uptrace/bun/extra/bundebug v1.1.16 // indirect
github.com/vmihailenco/msgpack/v5 v5.3.5 // indirect
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
golang.org/x/net v0.15.0 // indirect

27
go.sum
View file

@ -8,34 +8,39 @@ github.com/bwmarrin/snowflake v0.3.0 h1:xm67bEhkKh6ij1790JB83OujPR5CzNe8QuQqAgIS
github.com/bwmarrin/snowflake v0.3.0/go.mod h1:NdZxfVWX+oR6y2K0o6qAYv6gIOP9rjG0/E9WsDpxqwE=
github.com/carlmjohnson/requests v0.23.5 h1:NPANcAofwwSuC6SIMwlgmHry2V3pLrSqRiSBKYbNHHA=
github.com/carlmjohnson/requests v0.23.5/go.mod h1:zG9P28thdRnN61aD7iECFhH5iGGKX2jIjKQD9kqYH+o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs=
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c h1:cVA8Fd14+bmcDyVutgf976DrV9RzNO4SMzUQmfJDMrw=
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c/go.mod h1:W6gI0HQAbNyEO/62hesTBIbabSGJaEdlUApLw8UtuB0=
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc h1:9lRDQMhESg+zvGYmW5DyG0UqvY96Bu5QYsTLvCHdrgo=
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc/go.mod h1:bciPuU6GHm1iF1pBvUfxfsH0Wmnc2VbpgvbI9ZWuIRs=
github.com/uptrace/bun v1.1.16 h1:cn9cgEMFwcyYRsQLfxCRMUxyK1WaHwOVrR3TvzEFZ/A=
github.com/uptrace/bun v1.1.16/go.mod h1:7HnsMRRvpLFUcquJxp22JO8PsWKpFQO/gNXqqsuGWg8=
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 h1:gbc9BP/e4sNOB9VBj+Si46dpOz2oktmZPidkda92GYY=
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16/go.mod h1:YNezpK7fIn5Wa2WGmTCZ/nEyiswcXmuT4iNWADeL1x4=
github.com/uptrace/bun/extra/bundebug v1.1.16 h1:SgicRQGtnjhrIhlYOxdkOm1Em4s6HykmT3JblHnoTBM=
github.com/uptrace/bun/extra/bundebug v1.1.16/go.mod h1:SkiOkfUirBiO1Htc4s5bQKEq+JSeU1TkBVpMsPz2ePM=
github.com/vmihailenco/msgpack/v5 v5.3.5 h1:5gO0H1iULLWGhs2H5tbAHIZTV8/cYafcFOr9znI5mJU=
github.com/vmihailenco/msgpack/v5 v5.3.5/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc=
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8 h1:2YZN1WHQKIfOweOtevz5t5PiKzPf5nFGdezH1EtAPeM=
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8/go.mod h1:DjHUE1+g0AecmviVQYFvsOHlA95D/Rs1mnPK8f81wu4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
@ -54,9 +59,7 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
@ -72,6 +75,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View file

@ -0,0 +1,41 @@
package config
import (
"git.tdpain.net/pkg/cfger"
"log/slog"
"os"
"sync"
)
// Config holds the indexer settings that Get loads from config.yml.
type Config struct {
	// DatabaseName is the value of the "index.databaseName" key
	// (default "index.db").
	DatabaseName string

	// CrawlDataDir is the value of the "dataDir" key (default "crawlData").
	CrawlDataDir string
}
// Package-level state backing Get.
var (
	// conf is the loaded configuration; it stays nil until Get has
	// successfully loaded it.
	conf *Config

	// loadOnce ensures the configuration file is loaded at most once.
	loadOnce = &sync.Once{}
)
// Get returns the process-wide configuration, loading it from config.yml the
// first time it is called. If that load fails the error is logged and the
// process exits with status 1.
func Get() *Config {
	var loadErr error

	loadOnce.Do(func() {
		loader := cfger.New()
		if err := loader.Load("config.yml"); err != nil {
			// Hand the failure to the enclosing call, which reports it.
			loadErr = err
			return
		}

		loaded := new(Config)
		loaded.DatabaseName = loader.WithDefault("index.databaseName", "index.db").AsString()
		loaded.CrawlDataDir = loader.WithDefault("dataDir", "crawlData").AsString()
		conf = loaded
	})

	if loadErr != nil {
		slog.Error("fatal error when loading configuration", "err", loadErr)
		os.Exit(1)
	}
	return conf
}

View file

@ -0,0 +1,60 @@
package database
import (
"context"
"database/sql"
"errors"
"git.tdpain.net/codemicro/hn84/util"
_ "github.com/mattn/go-sqlite3"
"github.com/uptrace/bun"
"github.com/uptrace/bun/dialect/sqlitedialect"
"os"
)
// Document is the bun model for one indexed page.
type Document struct {
	bun.BaseModel

	// ID is the primary key of the document row.
	ID string `bun:",pk"`
	// URL is the address recorded for the page.
	URL string
	// Title is the page title stored alongside the document.
	Title string
}
// Token is the bun model for a single occurrence of a term in a document.
type Token struct {
	bun.BaseModel

	// Token is the term text.
	Token string
	// DocumentID is the ID of the Document the term was found in.
	DocumentID string
	// Start and End give the position of the occurrence; their exact
	// meaning is defined by whoever writes the rows.
	Start int
	End   int
}
// Setup opens the SQLite database stored at filepath and returns it wrapped
// in a bun.DB.
//
// If no file exists at filepath yet, the Document and Token tables are
// created. If creating either table fails, the freshly opened handle is
// closed before the error is returned so it does not leak.
func Setup(filepath string) (*bun.DB, error) {
	// Decide up front whether the schema has to be created: a missing file
	// means a brand-new database.
	alreadyExists := true
	if _, err := os.Stat(filepath); err != nil {
		if !errors.Is(err, os.ErrNotExist) {
			return nil, err
		}
		alreadyExists = false
	}

	db, err := sql.Open("sqlite3", filepath)
	if err != nil {
		return nil, util.Wrap("open database", err)
	}
	db.SetMaxOpenConns(1) // https://github.com/mattn/go-sqlite3/issues/274#issuecomment-191597862

	b := bun.NewDB(db, sqlitedialect.New())
	//b.AddQueryHook(bundebug.NewQueryHook(bundebug.WithVerbose(true)))

	if alreadyExists {
		return b, nil
	}

	ctx := context.Background()
	if _, err := b.NewCreateTable().Model((*Document)(nil)).Exec(ctx); err != nil {
		// The table-creation error is the one worth reporting; a failure to
		// close the half-initialised handle adds nothing for the caller.
		_ = db.Close()
		return nil, util.Wrap("create Document table", err)
	}
	if _, err := b.NewCreateTable().Model((*Token)(nil)).Exec(ctx); err != nil {
		_ = db.Close() // see above
		return nil, util.Wrap("create Token table", err)
	}

	return b, nil
}

220
index/main.go Normal file
View file

@ -0,0 +1,220 @@
package main
import (
"bytes"
"context"
"encoding/json"
"git.tdpain.net/codemicro/hn84/index/internal/config"
"git.tdpain.net/codemicro/hn84/index/internal/database"
"git.tdpain.net/codemicro/hn84/util"
"github.com/PuerkitoBio/goquery"
"github.com/uptrace/bun"
"github.com/zentures/porter2"
"log/slog"
"os"
"path"
"strings"
)
// main runs the indexer and exits with status 1 if it reports an error.
func main() {
	err := run()
	if err == nil {
		return
	}

	slog.Error("unrecoverable runtime error", "error", err)
	os.Exit(1)
}
// run opens the index database named in the configuration and indexes the
// configured crawl data directory. The database handle is closed before run
// returns, whether or not indexing succeeded.
func run() error {
	db, err := database.Setup(config.Get().DatabaseName)
	if err != nil {
		return util.Wrap("setup database", err)
	}
	defer func() {
		// A close failure cannot change the outcome of the indexing work
		// that already happened, so it is only logged.
		if closeErr := db.Close(); closeErr != nil {
			slog.Warn("closing database", "error", closeErr)
		}
	}()

	return walkDir(db, config.Get().CrawlDataDir)
}
// walkDir indexes every crawled page stored directly inside dir.
//
// A page is any non-directory entry named "<id>.html". Each one is handed to
// indexDocument; the first failure aborts the walk and is returned.
func walkDir(db *bun.DB, dir string) error {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return util.Wrap("read data dir", err)
	}

	for _, entry := range entries {
		// Require the complete ".html" extension so the id is exactly the
		// file name without it; names that merely end in "html" are skipped.
		id, isHTML := strings.CutSuffix(entry.Name(), ".html")
		if !isHTML || entry.IsDir() {
			continue
		}

		if err := indexDocument(db, dir, id); err != nil {
			return err
		}
	}

	return nil
}

// indexDocument indexes the page whose files in dir are named after id: it
// tokenises "<id>.html" and stores the tokens, writes the derived plaintext
// to "<id>.txt", and stores a Document row using the URL from "<id>.json".
func indexDocument(db *bun.DB, dir, id string) error {
	ctx := context.Background()

	// Process tokens
	htmlContent, err := os.ReadFile(path.Join(dir, id+".html"))
	if err != nil {
		return util.Wrap("read HTML file", err)
	}

	plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
	if err != nil {
		return util.Wrap("convert HTML to plaintext", err)
	}
	plaintext = filterPlaintextCharacters(plaintext)

	tokens := tokenise(plaintext)
	tokens = filterStopwords(tokens)
	stemTokens(tokens)

	// A page can legitimately yield no indexable tokens; skip the bulk
	// insert in that case instead of issuing one with no rows.
	if dbTokens := convertToDatabaseTokens(tokens, id); len(dbTokens) > 0 {
		if _, err := db.NewInsert().Model(&dbTokens).Exec(ctx); err != nil {
			return util.Wrap("unable to insert tokens to database", err)
		}
	}

	// Dump plaintext to file. The owner needs write permission so that a
	// later run can overwrite the file.
	if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0o644); err != nil {
		return util.Wrap("write plaintext", err)
	}

	// Read extra data
	var dat struct {
		URL string
	}
	jsonBytes, err := os.ReadFile(path.Join(dir, id+".json"))
	if err != nil {
		return util.Wrap("read document info", err)
	}
	if err := json.Unmarshal(jsonBytes, &dat); err != nil {
		return util.Wrap("unmarshal document info", err)
	}

	doc := &database.Document{
		ID:    id,
		URL:   dat.URL,
		Title: pageTitle,
	}
	if _, err := db.NewInsert().Model(doc).Exec(ctx); err != nil {
		return util.Wrap("insert document to database", err)
	}

	return nil
}
// intermediateToken is a token as it moves through the indexing pipeline,
// before it is converted into a database row.
type intermediateToken struct {
	// Val is the token text.
	Val string
	// Start is the offset of the token's first byte in the plaintext.
	Start int
	// End is the offset of the token's last byte in the plaintext.
	End int
}
// convertHTMLToPlaintext parses htmlStr and returns, in order: the trimmed
// title text, a single space and the trimmed text of the body joined into
// one string; the trimmed title text on its own; and any parse error.
//
// The title is the empty string when the document has no title element.
func convertHTMLToPlaintext(htmlStr string) (string, string, error) {
	document, err := goquery.NewDocumentFromReader(bytes.NewBufferString(htmlStr))
	if err != nil {
		return "", "", util.Wrap("load HTML into goquery", err)
	}

	pageTitle := ""
	if titleSel := document.Find("title"); len(titleSel.Nodes) > 0 {
		pageTitle = strings.TrimSpace(titleSel.Text())
	}

	bodyText := strings.TrimSpace(document.Find("body").Text())
	return pageTitle + " " + bodyText, pageTitle, nil
}
// tokenise splits plaintext on the space character and returns one
// lower-cased token per word, in order of appearance.
//
// Start and End are byte offsets into plaintext; End is inclusive, i.e. the
// offset of the word's last byte. Runs of spaces and leading or trailing
// spaces never produce empty tokens, and a final word is emitted even when
// it ends in a multi-byte rune. An input with no words yields a nil slice.
func tokenise(plaintext string) []*intermediateToken {
	var toks []*intermediateToken

	// wordStart is the byte offset of the word currently being scanned, or
	// -1 while between words.
	wordStart := -1

	// emit records the word spanning plaintext[wordStart:end].
	emit := func(end int) {
		toks = append(toks, &intermediateToken{
			Val:   strings.ToLower(plaintext[wordStart:end]),
			Start: wordStart,
			End:   end - 1,
		})
		wordStart = -1
	}

	for i, r := range plaintext {
		switch {
		case r != ' ':
			if wordStart < 0 {
				wordStart = i
			}
		case wordStart >= 0:
			// A space right after a word closes that word.
			emit(i)
		}
	}

	// Flush a word that runs up to the end of the input.
	if wordStart >= 0 {
		emit(len(plaintext))
	}

	return toks
}
// filterStopwords removes every token whose Val is a key of stopwords.
//
// The filtering happens in place: the returned slice shares its backing
// array with tokens, so the caller should use the returned slice and stop
// using the argument.
func filterStopwords(tokens []*intermediateToken) []*intermediateToken {
	kept := tokens[:0]
	for _, candidate := range tokens {
		if _, isStopword := stopwords[candidate.Val]; isStopword {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}
// stemTokens replaces the Val of every token with its porter2 stem. The
// tokens are modified in place.
func stemTokens(tokens []*intermediateToken) {
	for i := range tokens {
		tokens[i].Val = porter2.Stem(tokens[i].Val)
	}
}
// convertToDatabaseTokens builds one database.Token per intermediate token,
// copying its value and position and tagging it with documentID. The result
// is nil when tokens is empty.
func convertToDatabaseTokens(tokens []*intermediateToken, documentID string) []*database.Token {
	var rows []*database.Token
	for i := range tokens {
		src := tokens[i]

		row := new(database.Token)
		row.Token = src.Val
		row.DocumentID = documentID
		row.Start = src.Start
		row.End = src.End

		rows = append(rows, row)
	}
	return rows
}
// filterPlaintextCharacters reduces plaintext to ASCII letters and digits
// separated by single spaces.
//
// Whitespace of any common kind (space, tab, newline, vertical tab, form
// feed, carriage return, NEL and no-break space) acts as a word separator;
// a run of separators collapses to one space and leading or trailing
// separators are removed. Every other character is dropped without leaving
// a gap, so "don't" becomes "dont".
func filterPlaintextCharacters(plaintext string) string {
	var out strings.Builder
	out.Grow(len(plaintext))

	// pendingSpace records that a separator was seen since the last kept
	// character; the space is only written once another word follows, which
	// is what collapses runs and trims the ends.
	pendingSpace := false

	for _, r := range plaintext {
		switch {
		case ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9'):
			if pendingSpace && out.Len() > 0 {
				out.WriteByte(' ')
			}
			pendingSpace = false
			out.WriteRune(r)
		case r == ' ', r == '\t', r == '\n', r == '\v', r == '\f', r == '\r', r == 0x85, r == 0xA0:
			// Line breaks and tabs separate words just like a space does;
			// dropping them would glue neighbouring words together.
			pendingSpace = true
		}
	}

	return out.String()
}
// stopwords is the set of very common English words that are left out of
// the index. Keys are lower-case because tokens are lower-cased before they
// are looked up here.
var stopwords = map[string]struct{}{
	"the":  {},
	"be":   {},
	"to":   {},
	"of":   {},
	"and":  {},
	"a":    {},
	"in":   {},
	"that": {},
	"have": {},
	"i":    {},
	"it":   {},
	"for":  {},
	"not":  {},
	"on":   {},
	"with": {},
	"he":   {},
	"as":   {},
	"you":  {},
	"do":   {},
	"at":   {},
	"this": {},
	"but":  {},
	"his":  {},
	"by":   {},
	"from": {},
}