diff --git a/crawl/internal/config/config.go b/crawl/internal/config/config.go index 4768417..b10d0a9 100644 --- a/crawl/internal/config/config.go +++ b/crawl/internal/config/config.go @@ -34,7 +34,7 @@ func Get() *Config { NumWorkers: cl.WithDefault("crawler.numWorkers", 8).AsInt(), MaxPagesPerDomain: cl.WithDefault("crawler.maxPagesPerDomain", 300).AsInt(), UserAgent: cl.WithDefault("crawler.userAgent", "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0").AsString(), - CrawlDataDir: cl.WithDefault("crawler.dataDir", "crawlData").AsString(), + CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(), } }) diff --git a/go.mod b/go.mod index 8d39d99..4634570 100644 --- a/go.mod +++ b/go.mod @@ -3,20 +3,22 @@ module git.tdpain.net/codemicro/hn84 go 1.21.1 require ( - git.tdpain.net/pkg/cfger v0.1.0 // indirect - github.com/PuerkitoBio/goquery v1.8.1 // indirect + git.tdpain.net/pkg/cfger v0.1.0 + github.com/PuerkitoBio/goquery v1.8.1 + github.com/bwmarrin/snowflake v0.3.0 + github.com/carlmjohnson/requests v0.23.5 + github.com/mattn/go-sqlite3 v1.14.17 + github.com/uptrace/bun v1.1.16 + github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 + github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8 +) + +require ( github.com/andybalholm/cascadia v1.3.1 // indirect - github.com/bwmarrin/snowflake v0.3.0 // indirect - github.com/carlmjohnson/requests v0.23.5 // indirect - github.com/fatih/color v1.15.0 // indirect github.com/jinzhu/inflection v1.0.0 // indirect - github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect - github.com/mattn/go-sqlite3 v1.14.17 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c // indirect github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc // indirect - github.com/uptrace/bun v1.1.16 // indirect - github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 // indirect - 
github.com/uptrace/bun/extra/bundebug v1.1.16 // indirect github.com/vmihailenco/msgpack/v5 v5.3.5 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect golang.org/x/net v0.15.0 // indirect diff --git a/go.sum b/go.sum index b7910a1..0e6f5c3 100644 --- a/go.sum +++ b/go.sum @@ -8,34 +8,39 @@ github.com/bwmarrin/snowflake v0.3.0 h1:xm67bEhkKh6ij1790JB83OujPR5CzNe8QuQqAgIS github.com/bwmarrin/snowflake v0.3.0/go.mod h1:NdZxfVWX+oR6y2K0o6qAYv6gIOP9rjG0/E9WsDpxqwE= github.com/carlmjohnson/requests v0.23.5 h1:NPANcAofwwSuC6SIMwlgmHry2V3pLrSqRiSBKYbNHHA= github.com/carlmjohnson/requests v0.23.5/go.mod h1:zG9P28thdRnN61aD7iECFhH5iGGKX2jIjKQD9kqYH+o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= -github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= -github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-sqlite3 v1.14.17 
h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM= github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c h1:cVA8Fd14+bmcDyVutgf976DrV9RzNO4SMzUQmfJDMrw= +github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c/go.mod h1:W6gI0HQAbNyEO/62hesTBIbabSGJaEdlUApLw8UtuB0= github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc h1:9lRDQMhESg+zvGYmW5DyG0UqvY96Bu5QYsTLvCHdrgo= github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc/go.mod h1:bciPuU6GHm1iF1pBvUfxfsH0Wmnc2VbpgvbI9ZWuIRs= github.com/uptrace/bun v1.1.16 h1:cn9cgEMFwcyYRsQLfxCRMUxyK1WaHwOVrR3TvzEFZ/A= github.com/uptrace/bun v1.1.16/go.mod h1:7HnsMRRvpLFUcquJxp22JO8PsWKpFQO/gNXqqsuGWg8= github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 h1:gbc9BP/e4sNOB9VBj+Si46dpOz2oktmZPidkda92GYY= github.com/uptrace/bun/dialect/sqlitedialect v1.1.16/go.mod h1:YNezpK7fIn5Wa2WGmTCZ/nEyiswcXmuT4iNWADeL1x4= -github.com/uptrace/bun/extra/bundebug v1.1.16 h1:SgicRQGtnjhrIhlYOxdkOm1Em4s6HykmT3JblHnoTBM= -github.com/uptrace/bun/extra/bundebug v1.1.16/go.mod h1:SkiOkfUirBiO1Htc4s5bQKEq+JSeU1TkBVpMsPz2ePM= github.com/vmihailenco/msgpack/v5 v5.3.5 
h1:5gO0H1iULLWGhs2H5tbAHIZTV8/cYafcFOr9znI5mJU= github.com/vmihailenco/msgpack/v5 v5.3.5/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc= github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8 h1:2YZN1WHQKIfOweOtevz5t5PiKzPf5nFGdezH1EtAPeM= +github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8/go.mod h1:DjHUE1+g0AecmviVQYFvsOHlA95D/Rs1mnPK8f81wu4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -54,9 +59,7 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -72,6 +75,8 @@ golang.org/x/tools 
v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
+gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/index/internal/config/config.go b/index/internal/config/config.go
new file mode 100644
index 0000000..b8d73c0
--- /dev/null
+++ b/index/internal/config/config.go
@@ -0,0 +1,49 @@
+// Package config loads the indexer's settings from config.yml.
+package config
+
+import (
+	"log/slog"
+	"os"
+	"sync"
+
+	"git.tdpain.net/pkg/cfger"
+)
+
+// Config holds the settings the indexer reads at startup.
+type Config struct {
+	// DatabaseName is the value of index.databaseName (default "index.db").
+	DatabaseName string
+	// CrawlDataDir is the value of dataDir (default "crawlData").
+	CrawlDataDir string
+}
+
+var (
+	conf     *Config
+	loadOnce = new(sync.Once)
+)
+
+// Get returns the process-wide configuration, loading config.yml on the
+// first call. If loading fails the error is logged and the process exits
+// with status 1.
+func Get() *Config {
+	var outerErr error
+	loadOnce.Do(func() {
+		cl := cfger.New()
+		if err := cl.Load("config.yml"); err != nil {
+			outerErr = err
+			return
+		}
+
+		conf = &Config{
+			DatabaseName: cl.WithDefault("index.databaseName", "index.db").AsString(),
+			CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
+		}
+	})
+
+	if outerErr != nil {
+		slog.Error("fatal error when loading configuration", "err", outerErr)
+		os.Exit(1)
+	}
+
+	return conf
+}
diff --git a/index/internal/database/db.go b/index/internal/database/db.go
new file mode 100644
index 0000000..821dac2
--- /dev/null
+++ b/index/internal/database/db.go
@@ -0,0 +1,71 @@
+// Package database defines the index schema and opens the SQLite database.
+package database
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"os"
+
+	"git.tdpain.net/codemicro/hn84/util"
+	_ "github.com/mattn/go-sqlite3"
+	"github.com/uptrace/bun"
+	"github.com/uptrace/bun/dialect/sqlitedialect"
+)
+
+// Document is one indexed page, keyed by ID.
+type Document struct {
+	bun.BaseModel
+
+	ID    string `bun:",pk"`
+	URL   string
+	Title string
+}
+
+// Token is one occurrence of a token in the document DocumentID. Start and
+// End are the offsets recorded by the tokeniser for that occurrence.
+type Token struct {
+	bun.BaseModel
+
+	Token      string
+	DocumentID string
+	Start, End int
+}
+
+// Setup opens the SQLite database at filepath and, when no file existed at
+// that path beforehand, creates the Document and Token tables.
+func Setup(filepath string) (*bun.DB, error) {
+	alreadyExists := true
+	if _, err := os.Stat(filepath); err != nil {
+		if !errors.Is(err, os.ErrNotExist) {
+			return nil, util.Wrap("stat database file", err)
+		}
+		alreadyExists = false
+	}
+
+	db, err := sql.Open("sqlite3", filepath)
+	if err != nil {
+		return nil, util.Wrap("open database", err)
+	}
+
+	db.SetMaxOpenConns(1) // https://github.com/mattn/go-sqlite3/issues/274#issuecomment-191597862
+
+	b := bun.NewDB(db, sqlitedialect.New())
+	//b.AddQueryHook(bundebug.NewQueryHook(bundebug.WithVerbose(true)))
+
+	if !alreadyExists {
+		ctx := context.Background()
+
+		if _, err := b.NewCreateTable().Model((*Document)(nil)).Exec(ctx); err != nil {
+			_ = b.Close() // don't leak the handle; the create error is the one worth reporting
+			return nil, util.Wrap("create Document table", err)
+		}
+
+		if _, err := b.NewCreateTable().Model((*Token)(nil)).Exec(ctx); err != nil {
+			_ = b.Close() // don't leak the handle; the create error is the one worth reporting
+			return nil, util.Wrap("create Token table", err)
+		}
+	}
+
+	return b, nil
+}
diff --git a/index/main.go b/index/main.go
new file mode 100644
index 0000000..56cbfa6
--- /dev/null
+++ b/index/main.go
@@ -0,0 +1,247 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+	"unicode"
+
+	"git.tdpain.net/codemicro/hn84/index/internal/config"
+	"git.tdpain.net/codemicro/hn84/index/internal/database"
+	"git.tdpain.net/codemicro/hn84/util"
+	"github.com/PuerkitoBio/goquery"
+	"github.com/uptrace/bun"
+	"github.com/zentures/porter2"
+)
+
+// htmlExt is the suffix that marks a crawled page inside the data directory.
+const htmlExt = ".html"
+
+func main() {
+	if err := run(); err != nil {
+		slog.Error("unrecoverable runtime error", "error", err)
+		os.Exit(1)
+	}
+}
+
+// run opens the index database and indexes every crawled page in the
+// configured data directory.
+func run() error {
+	db, err := database.Setup(config.Get().DatabaseName)
+	if err != nil {
+		return util.Wrap("setup database", err)
+	}
+	defer db.Close()
+
+	return walkDir(db, config.Get().CrawlDataDir)
+}
+
+// walkDir indexes every non-directory entry of dir whose name ends in
+// ".html". It stops at the first file that fails and returns that error.
+func walkDir(db *bun.DB, dir string) error {
+	de, err := os.ReadDir(dir)
+	if err != nil {
+		return util.Wrap("read data dir", err)
+	}
+
+	for _, entry := range de {
+		name := entry.Name()
+		if entry.IsDir() || !strings.HasSuffix(name, htmlExt) {
+			continue
+		}
+
+		if err := indexDocument(db, dir, strings.TrimSuffix(name, htmlExt)); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// indexDocument tokenises dir/ID.html, stores its tokens and a Document row
+// (URL taken from dir/ID.json) in db, and writes the filtered plaintext to
+// dir/ID.txt, where ID is the id argument.
+func indexDocument(db *bun.DB, dir, id string) error {
+	ctx := context.Background()
+
+	htmlContent, err := os.ReadFile(filepath.Join(dir, id+htmlExt))
+	if err != nil {
+		return util.Wrap("read HTML file", err)
+	}
+
+	// Read the crawler's metadata up front so that a missing or malformed
+	// JSON file is reported before any tokens have been written.
+	var dat struct {
+		URL string
+	}
+
+	jsonBytes, err := os.ReadFile(filepath.Join(dir, id+".json"))
+	if err != nil {
+		return util.Wrap("read document info", err)
+	}
+
+	if err := json.Unmarshal(jsonBytes, &dat); err != nil {
+		return util.Wrap("unmarshal document info", err)
+	}
+
+	plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
+	if err != nil {
+		return util.Wrap("convert HTML to plaintext", err)
+	}
+
+	plaintext = filterPlaintextCharacters(plaintext)
+	tokens := filterStopwords(tokenise(plaintext))
+	stemTokens(tokens)
+
+	// A page with no indexable text yields no tokens; there is nothing to
+	// insert in that case.
+	if dbTokens := convertToDatabaseTokens(tokens, id); len(dbTokens) != 0 {
+		if _, err := db.NewInsert().Model(&dbTokens).Exec(ctx); err != nil {
+			return util.Wrap("unable to insert tokens to database", err)
+		}
+	}
+
+	// Dump plaintext to file. Token offsets refer to this text.
+	if err := os.WriteFile(filepath.Join(dir, id+".txt"), []byte(plaintext), 0o644); err != nil {
+		return util.Wrap("write plaintext", err)
+	}
+
+	if _, err := db.NewInsert().Model(&database.Document{
+		ID:    id,
+		URL:   dat.URL,
+		Title: pageTitle,
+	}).Exec(ctx); err != nil {
+		return util.Wrap("insert document to database", err)
+	}
+
+	return nil
+}
+
+// intermediateToken is a token plus the inclusive byte offsets of its first
+// and last characters in the filtered plaintext.
+type intermediateToken struct {
+	Val        string
+	Start, End int
+}
+
+// convertHTMLToPlaintext parses htmlStr and returns the page title followed
+// by the body text, and the title on its own.
+func convertHTMLToPlaintext(htmlStr string) (string, string, error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(htmlStr))
+	if err != nil {
+		return "", "", util.Wrap("load HTML into goquery", err)
+	}
+
+	// Only the first title element is the document title; later matches are
+	// typically title elements nested inside inline SVG.
+	titleStr := strings.TrimSpace(doc.Find("title").First().Text())
+
+	return titleStr + " " + strings.TrimSpace(doc.Find("body").Text()), titleStr, nil
+}
+
+// tokenise splits plaintext on spaces and returns the lower-cased tokens
+// with their inclusive byte offsets. Runs of spaces produce no empty tokens.
+func tokenise(plaintext string) []*intermediateToken {
+	var tok []*intermediateToken
+	start := -1 // offset where the current token began, or -1 between tokens
+	for i := 0; i <= len(plaintext); i++ {
+		atBoundary := i == len(plaintext) || plaintext[i] == ' '
+		switch {
+		case !atBoundary && start < 0:
+			start = i
+		case atBoundary && start >= 0:
+			tok = append(tok, &intermediateToken{
+				Val:   strings.ToLower(plaintext[start:i]),
+				Start: start,
+				End:   i - 1,
+			})
+			start = -1
+		}
+	}
+	return tok
+}
+
+// filterStopwords removes stopwords from tokens in place and returns the
+// shortened slice.
+func filterStopwords(tokens []*intermediateToken) []*intermediateToken {
+	kept := tokens[:0]
+	for _, tok := range tokens {
+		if _, found := stopwords[tok.Val]; !found {
+			kept = append(kept, tok)
+		}
+	}
+	return kept
+}
+
+// stemTokens replaces each token's value with its Porter2 stem.
+func stemTokens(tokens []*intermediateToken) {
+	for _, tok := range tokens {
+		tok.Val = porter2.Stem(tok.Val)
+	}
+}
+
+// convertToDatabaseTokens builds one database.Token per token, all tagged
+// with documentID.
+func convertToDatabaseTokens(tokens []*intermediateToken, documentID string) []*database.Token {
+	res := make([]*database.Token, 0, len(tokens))
+	for _, tok := range tokens {
+		res = append(res, &database.Token{
+			Token:      tok.Val,
+			DocumentID: documentID,
+			Start:      tok.Start,
+			End:        tok.End,
+		})
+	}
+	return res
+}
+
+// filterPlaintextCharacters keeps ASCII letters and digits, turns every
+// whitespace character into a space so words on separate lines stay
+// separate, drops everything else, and collapses runs of spaces.
+func filterPlaintextCharacters(plaintext string) string {
+	var b strings.Builder
+	b.Grow(len(plaintext))
+	for _, char := range plaintext {
+		switch {
+		case 'A' <= char && char <= 'Z', 'a' <= char && char <= 'z', '0' <= char && char <= '9':
+			b.WriteRune(char)
+		case unicode.IsSpace(char):
+			b.WriteByte(' ')
+		}
+	}
+
+	return strings.Join(strings.Fields(b.String()), " ")
+}
+
+// stopwords lists the lower-case tokens that are left out of the index.
+var stopwords = map[string]struct{}{
+	"the":  {},
+	"be":   {},
+	"to":   {},
+	"of":   {},
+	"and":  {},
+	"a":    {},
+	"in":   {},
+	"that": {},
+	"have": {},
+	"i":    {}, // tokens are lower-cased before lookup, so "I" could never match
+	"it":   {},
+	"for":  {},
+	"not":  {},
+	"on":   {},
+	"with": {},
+	"he":   {},
+	"as":   {},
+	"you":  {},
+	"do":   {},
+	"at":   {},
+	"this": {},
+	"but":  {},
+	"his":  {},
+	"by":   {},
+	"from": {},
+}