diff --git a/ui/internal/config/config.go b/ui/internal/config/config.go
new file mode 100644
index 0000000..b8d73c0
--- /dev/null
+++ b/ui/internal/config/config.go
@@ -0,0 +1,45 @@
+// Package config loads the UI's runtime configuration.
+package config
+
+import (
+	"log/slog"
+	"os"
+	"sync"
+
+	"git.tdpain.net/pkg/cfger"
+)
+
+// Config holds the settings the UI reads from config.yml.
+type Config struct {
+	// DatabaseName is the "index.databaseName" setting (default "index.db").
+	DatabaseName string
+	// CrawlDataDir is the "dataDir" setting (default "crawlData").
+	CrawlDataDir string
+}
+
+var (
+	conf     *Config
+	loadOnce sync.Once
+)
+
+// Get returns the process-wide configuration, loading it from config.yml on
+// the first call. If loading fails the error is logged and the process exits
+// with status 1, so Get never returns nil.
+func Get() *Config {
+	loadOnce.Do(func() {
+		cl := cfger.New()
+		if err := cl.Load("config.yml"); err != nil {
+			// Exit from inside Do: a caller that was blocked on the Once must
+			// never be released to observe a nil conf.
+			slog.Error("fatal error when loading configuration", "err", err)
+			os.Exit(1)
+		}
+
+		conf = &Config{
+			DatabaseName: cl.WithDefault("index.databaseName", "index.db").AsString(),
+			CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
+		}
+	})
+
+	return conf
+}
diff --git a/ui/internal/database/database.go b/ui/internal/database/database.go
new file mode 100644
index 0000000..a0e5b55
--- /dev/null
+++ b/ui/internal/database/database.go
@@ -0,0 +1,60 @@
+// Package database opens the search index database for the UI.
+package database
+
+import (
+	"database/sql"
+	"errors"
+	"os"
+
+	"git.tdpain.net/codemicro/hn84/util"
+	_ "github.com/mattn/go-sqlite3"
+	"github.com/uptrace/bun"
+	"github.com/uptrace/bun/dialect/sqlitedialect"
+)
+
+// Document is the bun model for an indexed document.
+type Document struct {
+	bun.BaseModel
+
+	ID    string `bun:",pk"`
+	URL   string
+	Title string
+}
+
+// Token is the bun model for a token row; the search package counts one row
+// per occurrence of Token in the document identified by DocumentID.
+type Token struct {
+	bun.BaseModel
+
+	Token      string
+	DocumentID string
+	// Start and End presumably delimit the occurrence within the document
+	// text; confirm against the indexer.
+	Start, End int
+}
+
+// Setup opens the SQLite database at filepath and wraps it in a bun.DB.
+//
+// The UI never creates the database, so a missing file is reported as an error.
+func Setup(filepath string) (*bun.DB, error) {
+	// Check for the file before opening anything, so that no error path below
+	// can return while holding an unclosed *sql.DB.
+	if _, err := os.Stat(filepath); err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return nil, errors.New("cannot create database from new in ui")
+		}
+		return nil, util.Wrap("stat database", err)
+	}
+
+	db, err := sql.Open("sqlite3", filepath)
+	if err != nil {
+		return nil, util.Wrap("open database", err)
+	}
+
+	db.SetMaxOpenConns(1) // https://github.com/mattn/go-sqlite3/issues/274#issuecomment-191597862
+
+	b := bun.NewDB(db, sqlitedialect.New())
+	//b.AddQueryHook(bundebug.NewQueryHook(bundebug.WithVerbose(true)))
+
+	return b, nil
+}
diff --git a/ui/internal/search/search.go b/ui/internal/search/search.go
new file mode 100644
index 0000000..895e30b
--- /dev/null
+++ b/ui/internal/search/search.go
@@ -0,0 +1,205 @@
+// Package search tokenises queries and ranks indexed documents against them.
+package search
+
+import (
+	"context"
+	"math"
+	"sort"
+	"strings"
+
+	"git.tdpain.net/codemicro/hn84/ui/internal/database"
+	"git.tdpain.net/codemicro/hn84/util"
+	"github.com/uptrace/bun"
+	"github.com/zentures/porter2"
+)
+
+// Match is one search result: a document, the occurrences of query tokens
+// found in it, and its relevance score (higher is more relevant).
+type Match struct {
+	Document *database.Document
+	Tokens   []*database.Token
+	Ranking  float64
+}
+
+// DoSearch returns the documents that contain every token in query, ordered
+// by descending TF-IDF score with ties broken by ascending document ID.
+//
+// Duplicate query tokens are ignored. An empty query, or one that no document
+// fully satisfies, yields (nil, nil).
+func DoSearch(db *bun.DB, query []string) ([]*Match, error) {
+	ctx := context.Background()
+
+	query = util.Deduplicate(query)
+	if len(query) == 0 {
+		return nil, nil
+	}
+
+	var tokens []*database.Token
+	if err := db.NewSelect().Model(&tokens).Where("token in (?)", bun.In(query)).Scan(ctx, &tokens); err != nil {
+		return nil, util.Wrap("unable to execute query on database", err)
+	}
+
+	tokensByDocument := make(map[string][]*database.Token)
+	for _, token := range tokens {
+		tokensByDocument[token.DocumentID] = append(tokensByDocument[token.DocumentID], token)
+	}
+
+	// termFrequencies[doc][term] is how often term occurs in doc, for the
+	// documents that contain every query term. documentFrequencies[term] is
+	// the number of documents, matching or not, that contain term at all.
+	termFrequencies := make(map[string]map[string]int)
+	documentFrequencies := make(map[string]int, len(query))
+	for docID, docTokens := range tokensByDocument {
+		freqs := make(map[string]int, len(query))
+		for _, tok := range docTokens {
+			freqs[tok.Token]++
+		}
+		for term := range freqs {
+			documentFrequencies[term]++
+		}
+
+		// Every fetched row matched some query term, so a document holds all
+		// of them exactly when it holds len(query) distinct ones.
+		if len(freqs) != len(query) {
+			delete(tokensByDocument, docID)
+			continue
+		}
+		termFrequencies[docID] = freqs
+	}
+	if len(tokensByDocument) == 0 {
+		return nil, nil
+	}
+
+	tnd, err := db.NewSelect().Model((*database.Document)(nil)).Count(ctx)
+	if err != nil {
+		return nil, util.Wrap("count all documents", err)
+	}
+	totalNumDocs := float64(tnd)
+
+	// Inverse document frequency per term. At least one document contains
+	// every term at this point, so the divisor is never zero.
+	idfs := make(map[string]float64, len(query))
+	for _, term := range query {
+		idfs[term] = math.Log(totalNumDocs / float64(documentFrequencies[term]))
+	}
+
+	res := make([]*Match, 0, len(tokensByDocument))
+	for docID, docTokens := range tokensByDocument {
+		doc := new(database.Document)
+		if err := db.NewSelect().Model(doc).Where("id = ?", docID).Scan(ctx, doc); err != nil {
+			return nil, util.Wrap("final assembly", err)
+		}
+
+		m := &Match{
+			Document: doc,
+			Tokens:   docTokens,
+		}
+
+		// Score is the sum of tf*idf over the distinct query terms. Summing
+		// per occurrence instead would count each term tf times, weighting it
+		// by tf squared.
+		freqs := termFrequencies[docID]
+		for _, term := range query {
+			m.Ranking += float64(freqs[term]) * idfs[term]
+		}
+
+		res = append(res, m)
+	}
+
+	sort.Slice(res, func(i, j int) bool {
+		if res[i].Ranking != res[j].Ranking {
+			return res[i].Ranking > res[j].Ranking
+		}
+		// Map iteration order is random; fall back to the ID so that equally
+		// ranked documents come out in a stable order.
+		return res[i].Document.ID < res[j].Document.ID
+	})
+
+	return res, nil
+}
+
+// PlaintextToTokens converts free text into normalised search tokens: it
+// keeps only ASCII letters, digits and spaces, lower-cases and splits the
+// result, removes stopwords and stems each remaining token.
+func PlaintextToTokens(plain string) []string {
+	plain = filterPlaintextCharacters(plain)
+	tokens := tokenise(plain)
+	tokens = filterStopwords(tokens)
+	stemTokens(tokens)
+	return tokens
+}
+
+// tokenise lower-cases plaintext and splits it on whitespace. It never
+// produces empty tokens, and empty or all-whitespace input yields none.
+func tokenise(plaintext string) []string {
+	return strings.Fields(strings.ToLower(plaintext))
+}
+
+// filterStopwords removes stopwords from tokens in place, reusing the backing
+// array, and returns the shortened slice.
+func filterStopwords(tokens []string) []string {
+	kept := tokens[:0]
+	for _, tok := range tokens {
+		if _, found := stopwords[tok]; !found {
+			kept = append(kept, tok)
+		}
+	}
+	return kept
+}
+
+// stemTokens replaces each token, in place, with its Porter2 stem.
+func stemTokens(tokens []string) {
+	for i, tok := range tokens {
+		tokens[i] = porter2.Stem(tok)
+	}
+}
+
+// filterPlaintextCharacters drops every character that is not an ASCII
+// letter, an ASCII digit or a space, then collapses runs of spaces and trims
+// the ends.
+//
+// NOTE(review): tabs, newlines and punctuation are deleted rather than turned
+// into separators, so "foo\nbar" becomes the single word "foobar". Left
+// unchanged here in case the indexer tokenises the same way; confirm.
+func filterPlaintextCharacters(plaintext string) string {
+	kept := strings.Map(func(r rune) rune {
+		switch {
+		case 'A' <= r && r <= 'Z', 'a' <= r && r <= 'z', '0' <= r && r <= '9', r == ' ':
+			return r
+		default:
+			return -1 // negative means "drop this rune"
+		}
+	}, plaintext)
+
+	return strings.Join(strings.Fields(kept), " ")
+}
+
+// stopwords is the set of tokens dropped from queries. Keys must be lower
+// case because tokens are lower-cased before the lookup.
+var stopwords = map[string]struct{}{
+	"the":  {},
+	"be":   {},
+	"to":   {},
+	"of":   {},
+	"and":  {},
+	"a":    {},
+	"in":   {},
+	"that": {},
+	"have": {},
+	"i":    {}, // was "I", which a lower-cased token could never match
+	"it":   {},
+	"for":  {},
+	"not":  {},
+	"on":   {},
+	"with": {},
+	"he":   {},
+	"as":   {},
+	"you":  {},
+	"do":   {},
+	"at":   {},
+	"this": {},
+	"but":  {},
+	"his":  {},
+	"by":   {},
+	"from": {},
+}
diff --git a/ui/main.go b/ui/main.go
new file mode 100644
index 0000000..e30f4d9
--- /dev/null
+++ b/ui/main.go
@@ -0,0 +1,45 @@
+// Command ui runs a fixed search against the index and prints the results.
+package main
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+
+	"git.tdpain.net/codemicro/hn84/ui/internal/config"
+	"git.tdpain.net/codemicro/hn84/ui/internal/database"
+	"git.tdpain.net/codemicro/hn84/ui/internal/search"
+	"git.tdpain.net/codemicro/hn84/util"
+)
+
+func main() {
+	if err := run(); err != nil {
+		slog.Error("unhandled error", "error", err)
+		os.Exit(1) // report the failure to the caller instead of exiting 0
+	}
+}
+
+// run opens the configured database, searches it for "reading list" and
+// prints the query tokens followed by each match's title and ranking.
+func run() error {
+	db, err := database.Setup(config.Get().DatabaseName)
+	if err != nil {
+		return util.Wrap("setup database", err)
+	}
+	defer db.Close() // only SELECTs are issued, so a close error is not actionable
+
+	query := search.PlaintextToTokens("reading list")
+
+	matches, err := search.DoSearch(db, query)
+	if err != nil {
+		return util.Wrap("run search", err)
+	}
+
+	fmt.Println(query)
+
+	for _, m := range matches {
+		fmt.Println(m.Document.Title, m.Ranking)
+	}
+
+	return nil
+}
diff --git a/util/util.go b/util/util.go
index 2598b16..6352195 100644
--- a/util/util.go
+++ b/util/util.go
@@ -7,3 +7,18 @@ import (
 func Wrap(label string, err error) error {
 	return fmt.Errorf("%s: %w", label, err)
 }
+
+// Deduplicate returns the distinct elements of sliceList in order of first
+// appearance. The input is not modified; a nil or empty input yields nil.
+func Deduplicate[T comparable](sliceList []T) []T {
+	seen := make(map[T]struct{}, len(sliceList))
+	var list []T
+	for _, item := range sliceList {
+		if _, dup := seen[item]; dup {
+			continue
+		}
+		seen[item] = struct{}{}
+		list = append(list, item)
+	}
+	return list
+}