I wish to scream (yay indexer)
This commit is contained in:
parent
2ee0404ee8
commit
22671ea8cd
6 changed files with 351 additions and 23 deletions
|
@ -34,7 +34,7 @@ func Get() *Config {
|
|||
NumWorkers: cl.WithDefault("crawler.numWorkers", 8).AsInt(),
|
||||
MaxPagesPerDomain: cl.WithDefault("crawler.maxPagesPerDomain", 300).AsInt(),
|
||||
UserAgent: cl.WithDefault("crawler.userAgent", "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0").AsString(),
|
||||
CrawlDataDir: cl.WithDefault("crawler.dataDir", "crawlData").AsString(),
|
||||
CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
|
||||
}
|
||||
})
|
||||
|
||||
|
|
24
go.mod
24
go.mod
|
@ -3,20 +3,22 @@ module git.tdpain.net/codemicro/hn84
|
|||
go 1.21.1
|
||||
|
||||
require (
|
||||
git.tdpain.net/pkg/cfger v0.1.0 // indirect
|
||||
github.com/PuerkitoBio/goquery v1.8.1 // indirect
|
||||
git.tdpain.net/pkg/cfger v0.1.0
|
||||
github.com/PuerkitoBio/goquery v1.8.1
|
||||
github.com/bwmarrin/snowflake v0.3.0
|
||||
github.com/carlmjohnson/requests v0.23.5
|
||||
github.com/mattn/go-sqlite3 v1.14.17
|
||||
github.com/uptrace/bun v1.1.16
|
||||
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16
|
||||
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.1 // indirect
|
||||
github.com/bwmarrin/snowflake v0.3.0 // indirect
|
||||
github.com/carlmjohnson/requests v0.23.5 // indirect
|
||||
github.com/fatih/color v1.15.0 // indirect
|
||||
github.com/jinzhu/inflection v1.0.0 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.19 // indirect
|
||||
github.com/mattn/go-sqlite3 v1.14.17 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c // indirect
|
||||
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc // indirect
|
||||
github.com/uptrace/bun v1.1.16 // indirect
|
||||
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 // indirect
|
||||
github.com/uptrace/bun/extra/bundebug v1.1.16 // indirect
|
||||
github.com/vmihailenco/msgpack/v5 v5.3.5 // indirect
|
||||
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
|
||||
golang.org/x/net v0.15.0 // indirect
|
||||
|
|
27
go.sum
27
go.sum
|
@ -8,34 +8,39 @@ github.com/bwmarrin/snowflake v0.3.0 h1:xm67bEhkKh6ij1790JB83OujPR5CzNe8QuQqAgIS
|
|||
github.com/bwmarrin/snowflake v0.3.0/go.mod h1:NdZxfVWX+oR6y2K0o6qAYv6gIOP9rjG0/E9WsDpxqwE=
|
||||
github.com/carlmjohnson/requests v0.23.5 h1:NPANcAofwwSuC6SIMwlgmHry2V3pLrSqRiSBKYbNHHA=
|
||||
github.com/carlmjohnson/requests v0.23.5/go.mod h1:zG9P28thdRnN61aD7iECFhH5iGGKX2jIjKQD9kqYH+o=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs=
|
||||
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
|
||||
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
|
||||
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
|
||||
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
|
||||
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
|
||||
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c h1:cVA8Fd14+bmcDyVutgf976DrV9RzNO4SMzUQmfJDMrw=
|
||||
github.com/surge/glog v0.0.0-20141108051140-2578deb2b95c/go.mod h1:W6gI0HQAbNyEO/62hesTBIbabSGJaEdlUApLw8UtuB0=
|
||||
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc h1:9lRDQMhESg+zvGYmW5DyG0UqvY96Bu5QYsTLvCHdrgo=
|
||||
github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc/go.mod h1:bciPuU6GHm1iF1pBvUfxfsH0Wmnc2VbpgvbI9ZWuIRs=
|
||||
github.com/uptrace/bun v1.1.16 h1:cn9cgEMFwcyYRsQLfxCRMUxyK1WaHwOVrR3TvzEFZ/A=
|
||||
github.com/uptrace/bun v1.1.16/go.mod h1:7HnsMRRvpLFUcquJxp22JO8PsWKpFQO/gNXqqsuGWg8=
|
||||
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16 h1:gbc9BP/e4sNOB9VBj+Si46dpOz2oktmZPidkda92GYY=
|
||||
github.com/uptrace/bun/dialect/sqlitedialect v1.1.16/go.mod h1:YNezpK7fIn5Wa2WGmTCZ/nEyiswcXmuT4iNWADeL1x4=
|
||||
github.com/uptrace/bun/extra/bundebug v1.1.16 h1:SgicRQGtnjhrIhlYOxdkOm1Em4s6HykmT3JblHnoTBM=
|
||||
github.com/uptrace/bun/extra/bundebug v1.1.16/go.mod h1:SkiOkfUirBiO1Htc4s5bQKEq+JSeU1TkBVpMsPz2ePM=
|
||||
github.com/vmihailenco/msgpack/v5 v5.3.5 h1:5gO0H1iULLWGhs2H5tbAHIZTV8/cYafcFOr9znI5mJU=
|
||||
github.com/vmihailenco/msgpack/v5 v5.3.5/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc=
|
||||
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
|
||||
github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8 h1:2YZN1WHQKIfOweOtevz5t5PiKzPf5nFGdezH1EtAPeM=
|
||||
github.com/zentures/porter2 v0.0.0-20150829210152-56e4718818e8/go.mod h1:DjHUE1+g0AecmviVQYFvsOHlA95D/Rs1mnPK8f81wu4=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
|
@ -54,9 +59,7 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w
|
|||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
|
@ -72,6 +75,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
|
|||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
|
||||
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
41
index/internal/config/config.go
Normal file
41
index/internal/config/config.go
Normal file
|
@ -0,0 +1,41 @@
|
|||
package config
|
||||
|
||||
import (
|
||||
"git.tdpain.net/pkg/cfger"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Config holds the settings used by the indexer.
type Config struct {
	// DatabaseName is the path of the SQLite index database
	// (config key "index.databaseName").
	DatabaseName string

	// CrawlDataDir is the directory that holds the crawled documents
	// (config key "dataDir").
	CrawlDataDir string
}
|
||||
|
||||
var (
|
||||
conf *Config
|
||||
loadOnce = new(sync.Once)
|
||||
)
|
||||
|
||||
func Get() *Config {
|
||||
var outerErr error
|
||||
loadOnce.Do(func() {
|
||||
cl := cfger.New()
|
||||
if err := cl.Load("config.yml"); err != nil {
|
||||
outerErr = err
|
||||
return
|
||||
}
|
||||
|
||||
conf = &Config{
|
||||
DatabaseName: cl.WithDefault("index.databaseName", "index.db").AsString(),
|
||||
CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
|
||||
}
|
||||
})
|
||||
|
||||
if outerErr != nil {
|
||||
slog.Error("fatal error when loading configuration", "err", outerErr)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
return conf
|
||||
}
|
60
index/internal/database/db.go
Normal file
60
index/internal/database/db.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
package database
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"git.tdpain.net/codemicro/hn84/util"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/uptrace/bun/dialect/sqlitedialect"
|
||||
"os"
|
||||
)
|
||||
|
||||
type Document struct {
|
||||
bun.BaseModel
|
||||
|
||||
ID string `bun:",pk"`
|
||||
URL string
|
||||
Title string
|
||||
}
|
||||
|
||||
type Token struct {
|
||||
bun.BaseModel
|
||||
|
||||
Token string
|
||||
DocumentID string
|
||||
Start, End int
|
||||
}
|
||||
|
||||
func Setup(filepath string) (*bun.DB, error) {
|
||||
alreadyExists := true
|
||||
if _, err := os.Stat(filepath); err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return nil, err
|
||||
}
|
||||
alreadyExists = false
|
||||
}
|
||||
|
||||
db, err := sql.Open("sqlite3", filepath)
|
||||
if err != nil {
|
||||
return nil, util.Wrap("open database", err)
|
||||
}
|
||||
|
||||
db.SetMaxOpenConns(1) // https://github.com/mattn/go-sqlite3/issues/274#issuecomment-191597862
|
||||
|
||||
b := bun.NewDB(db, sqlitedialect.New())
|
||||
//b.AddQueryHook(bundebug.NewQueryHook(bundebug.WithVerbose(true)))
|
||||
|
||||
if !alreadyExists {
|
||||
if _, err := b.NewCreateTable().Model((*Document)(nil)).Exec(context.Background()); err != nil {
|
||||
return nil, util.Wrap("create Document table", err)
|
||||
}
|
||||
|
||||
if _, err := b.NewCreateTable().Model((*Token)(nil)).Exec(context.Background()); err != nil {
|
||||
return nil, util.Wrap("create Token table", err)
|
||||
}
|
||||
}
|
||||
|
||||
return b, nil
|
||||
}
|
220
index/main.go
Normal file
220
index/main.go
Normal file
|
@ -0,0 +1,220 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"git.tdpain.net/codemicro/hn84/index/internal/config"
|
||||
"git.tdpain.net/codemicro/hn84/index/internal/database"
|
||||
"git.tdpain.net/codemicro/hn84/util"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/zentures/porter2"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if err := run(); err != nil {
|
||||
slog.Error("unrecoverable runtime error", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func run() error {
|
||||
db, err := database.Setup(config.Get().DatabaseName)
|
||||
if err != nil {
|
||||
return util.Wrap("setup database", err)
|
||||
}
|
||||
|
||||
if err := walkDir(db, config.Get().CrawlDataDir); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func walkDir(db *bun.DB, dir string) error {
|
||||
de, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return util.Wrap("read data dir", err)
|
||||
}
|
||||
|
||||
for _, entry := range de {
|
||||
name := entry.Name()
|
||||
if !strings.HasSuffix(name, "html") {
|
||||
continue
|
||||
}
|
||||
|
||||
id := name[:len(name)-5]
|
||||
|
||||
// Process tokens
|
||||
|
||||
htmlContent, err := os.ReadFile(path.Join(dir, name))
|
||||
if err != nil {
|
||||
return util.Wrap("read HTML file", err)
|
||||
}
|
||||
|
||||
plaintext, pageTitle, err := convertHTMLToPlaintext(string(htmlContent))
|
||||
if err != nil {
|
||||
return util.Wrap("convert HTML to plaintext", err)
|
||||
}
|
||||
|
||||
plaintext = filterPlaintextCharacters(plaintext)
|
||||
tokens := tokenise(plaintext)
|
||||
tokens = filterStopwords(tokens)
|
||||
stemTokens(tokens)
|
||||
|
||||
dbTokens := convertToDatabaseTokens(tokens, id)
|
||||
if _, err := db.NewInsert().Model(&dbTokens).Exec(context.Background()); err != nil {
|
||||
return util.Wrap("unable to insert tokens to database", err)
|
||||
}
|
||||
|
||||
// Dump plaintext to file
|
||||
if err := os.WriteFile(path.Join(dir, id+".txt"), []byte(plaintext), 0466); err != nil {
|
||||
return util.Wrap("write plaintext", err)
|
||||
}
|
||||
|
||||
// Read extra data
|
||||
var dat = struct {
|
||||
URL string
|
||||
}{}
|
||||
|
||||
jsonBytes, err := os.ReadFile(path.Join(dir, id+".json"))
|
||||
if err != nil {
|
||||
return util.Wrap("read document info", err)
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonBytes, &dat); err != nil {
|
||||
return util.Wrap("unmarshal document info", err)
|
||||
}
|
||||
|
||||
if _, err := db.NewInsert().Model(&database.Document{
|
||||
ID: id,
|
||||
URL: dat.URL,
|
||||
Title: pageTitle,
|
||||
}).Exec(context.Background()); err != nil {
|
||||
return util.Wrap("insert document to database", err)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// intermediateToken is a token as it moves through the indexing pipeline,
// before it is converted into a database.Token.
type intermediateToken struct {
	// Val is the token text.
	Val string
	// Start is the offset of the token's first byte in the plaintext.
	Start int
	// End is the offset of the token's last byte in the plaintext.
	End int
}
|
||||
|
||||
func convertHTMLToPlaintext(htmlStr string) (string, string, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(htmlStr))
|
||||
if err != nil {
|
||||
return "", "", util.Wrap("load HTML into goquery", err)
|
||||
}
|
||||
|
||||
var titleStr string
|
||||
title := doc.Find("title")
|
||||
if len(title.Nodes) != 0 {
|
||||
titleStr = strings.TrimSpace(title.Text())
|
||||
}
|
||||
|
||||
return titleStr + " " + strings.TrimSpace(doc.Find("body").Text()), titleStr, nil
|
||||
}
|
||||
|
||||
func tokenise(plaintext string) []*intermediateToken {
|
||||
previousSpace := -1
|
||||
var tok []*intermediateToken
|
||||
pln := len(plaintext)
|
||||
for i, char := range plaintext {
|
||||
if char == ' ' || i == pln-1 {
|
||||
end := i - 1
|
||||
if char != ' ' {
|
||||
end += 1
|
||||
i += 1
|
||||
}
|
||||
tok = append(tok, &intermediateToken{
|
||||
Val: strings.ToLower(plaintext[previousSpace+1 : i]),
|
||||
Start: previousSpace + 1,
|
||||
End: end,
|
||||
})
|
||||
previousSpace = i
|
||||
}
|
||||
}
|
||||
return tok
|
||||
}
|
||||
|
||||
func filterStopwords(tokens []*intermediateToken) []*intermediateToken {
|
||||
n := 0
|
||||
for _, tok := range tokens {
|
||||
_, found := stopwords[tok.Val]
|
||||
if !found {
|
||||
tokens[n] = tok
|
||||
n += 1
|
||||
}
|
||||
}
|
||||
return tokens[:n]
|
||||
}
|
||||
|
||||
func stemTokens(tokens []*intermediateToken) {
|
||||
for _, tok := range tokens {
|
||||
tok.Val = porter2.Stem(tok.Val)
|
||||
}
|
||||
}
|
||||
|
||||
func convertToDatabaseTokens(tokens []*intermediateToken, documentID string) []*database.Token {
|
||||
var res []*database.Token
|
||||
for _, tok := range tokens {
|
||||
res = append(res, &database.Token{
|
||||
Token: tok.Val,
|
||||
DocumentID: documentID,
|
||||
Start: tok.Start,
|
||||
End: tok.End,
|
||||
})
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// filterPlaintextCharacters normalises plaintext for tokenising.
//
// The text is split into words on any run of whitespace (spaces, newlines,
// tabs, ...). Within each word every character other than A-Z, a-z and 0-9
// is removed, and words left empty by that are dropped. The remaining words
// are joined with single spaces, so the result has no leading, trailing or
// repeated spaces.
//
// Splitting on whitespace before filtering matters: deleting newlines and
// tabs outright would glue together words from adjacent lines of the page.
func filterPlaintextCharacters(plaintext string) string {
	keepAlphanumeric := func(char rune) rune {
		switch {
		case 'A' <= char && char <= 'Z',
			'a' <= char && char <= 'z',
			'0' <= char && char <= '9':
			return char
		}
		return -1 // strings.Map drops the character
	}

	words := strings.Fields(plaintext)
	kept := words[:0]
	for _, word := range words {
		if cleaned := strings.Map(keepAlphanumeric, word); cleaned != "" {
			kept = append(kept, cleaned)
		}
	}

	return strings.Join(kept, " ")
}
|
||||
|
||||
// stopwords is the set of very common English words that are left out of the
// index. Keys must be lower case, because tokens are lower-cased before they
// are looked up here.
var stopwords = map[string]struct{}{
	"the":  {},
	"be":   {},
	"to":   {},
	"of":   {},
	"and":  {},
	"a":    {},
	"in":   {},
	"that": {},
	"have": {},
	"i":    {},
	"it":   {},
	"for":  {},
	"not":  {},
	"on":   {},
	"with": {},
	"he":   {},
	"as":   {},
	"you":  {},
	"do":   {},
	"at":   {},
	"this": {},
	"but":  {},
	"his":  {},
	"by":   {},
	"from": {},
}
|
Reference in a new issue