HOLY SHIT IT SEARCHES
This commit is contained in:
parent
c12593b0bf
commit
3e38376eef
5 changed files with 334 additions and 0 deletions
41
ui/internal/config/config.go
Normal file
41
ui/internal/config/config.go
Normal file
|
@ -0,0 +1,41 @@
|
|||
package config
|
||||
|
||||
import (
|
||||
"git.tdpain.net/pkg/cfger"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Config holds the settings the ui reads from its configuration file.
type Config struct {
	// DatabaseName is the value of "index.databaseName" and CrawlDataDir the
	// value of "dataDir"; both are populated by Get.
	DatabaseName, CrawlDataDir string
}
|
||||
|
||||
var (
|
||||
conf *Config
|
||||
loadOnce = new(sync.Once)
|
||||
)
|
||||
|
||||
func Get() *Config {
|
||||
var outerErr error
|
||||
loadOnce.Do(func() {
|
||||
cl := cfger.New()
|
||||
if err := cl.Load("config.yml"); err != nil {
|
||||
outerErr = err
|
||||
return
|
||||
}
|
||||
|
||||
conf = &Config{
|
||||
DatabaseName: cl.WithDefault("index.databaseName", "index.db").AsString(),
|
||||
CrawlDataDir: cl.WithDefault("dataDir", "crawlData").AsString(),
|
||||
}
|
||||
})
|
||||
|
||||
if outerErr != nil {
|
||||
slog.Error("fatal error when loading configuration", "err", outerErr)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
return conf
|
||||
}
|
53
ui/internal/database/database.go
Normal file
53
ui/internal/database/database.go
Normal file
|
@ -0,0 +1,53 @@
|
|||
package database
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"git.tdpain.net/codemicro/hn84/util"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/uptrace/bun/dialect/sqlitedialect"
|
||||
"os"
|
||||
)
|
||||
|
||||
type Document struct {
|
||||
bun.BaseModel
|
||||
|
||||
ID string `bun:",pk"`
|
||||
URL string
|
||||
Title string
|
||||
}
|
||||
|
||||
type Token struct {
|
||||
bun.BaseModel
|
||||
|
||||
Token string
|
||||
DocumentID string
|
||||
Start, End int
|
||||
}
|
||||
|
||||
func Setup(filepath string) (*bun.DB, error) {
|
||||
alreadyExists := true
|
||||
if _, err := os.Stat(filepath); err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return nil, err
|
||||
}
|
||||
alreadyExists = false
|
||||
}
|
||||
|
||||
db, err := sql.Open("sqlite3", filepath)
|
||||
if err != nil {
|
||||
return nil, util.Wrap("open database", err)
|
||||
}
|
||||
|
||||
db.SetMaxOpenConns(1) // https://github.com/mattn/go-sqlite3/issues/274#issuecomment-191597862
|
||||
|
||||
b := bun.NewDB(db, sqlitedialect.New())
|
||||
//b.AddQueryHook(bundebug.NewQueryHook(bundebug.WithVerbose(true)))
|
||||
|
||||
if !alreadyExists {
|
||||
return nil, errors.New("cannot create database from new in ui")
|
||||
}
|
||||
|
||||
return b, nil
|
||||
}
|
190
ui/internal/search/search.go
Normal file
190
ui/internal/search/search.go
Normal file
|
@ -0,0 +1,190 @@
|
|||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"git.tdpain.net/codemicro/hn84/ui/internal/database"
|
||||
"git.tdpain.net/codemicro/hn84/util"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/zentures/porter2"
|
||||
"math"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type Match struct {
|
||||
Document *database.Document
|
||||
Tokens []*database.Token
|
||||
Ranking float64
|
||||
}
|
||||
|
||||
func DoSearch(db *bun.DB, query []string) ([]*Match, error) {
|
||||
query = util.Deduplicate(query)
|
||||
|
||||
var tokens []*database.Token
|
||||
if err := db.NewSelect().Model(&tokens).Where("token in (?)", bun.In(query)).Scan(context.Background(), &tokens); err != nil {
|
||||
return nil, util.Wrap("unable to execute query on database", err)
|
||||
}
|
||||
|
||||
tokensByDocument := make(map[string][]*database.Token)
|
||||
|
||||
for _, token := range tokens {
|
||||
tokensByDocument[token.DocumentID] = append(tokensByDocument[token.DocumentID], token)
|
||||
}
|
||||
|
||||
// each document must contain all tokens
|
||||
var docsToDelete []string
|
||||
for doc, tokens := range tokensByDocument {
|
||||
seen := make(map[string]struct{})
|
||||
for _, tok := range tokens {
|
||||
seen[tok.Token] = struct{}{}
|
||||
}
|
||||
if len(seen) != len(query) {
|
||||
docsToDelete = append(docsToDelete, doc)
|
||||
}
|
||||
}
|
||||
|
||||
for _, doc := range docsToDelete {
|
||||
delete(tokensByDocument, doc)
|
||||
}
|
||||
|
||||
termFrequencies := make(map[string]map[string]int)
|
||||
for doc, tokens := range tokensByDocument {
|
||||
freqs := make(map[string]int)
|
||||
for _, tok := range tokens {
|
||||
freqs[tok.Token] = freqs[tok.Token] + 1
|
||||
}
|
||||
termFrequencies[doc] = freqs
|
||||
}
|
||||
|
||||
var totalNumDocs float64
|
||||
{
|
||||
tnd, err := db.NewSelect().Model((*database.Document)(nil)).Count(context.Background())
|
||||
if err != nil {
|
||||
return nil, util.Wrap("count all documents", err)
|
||||
}
|
||||
totalNumDocs = float64(tnd)
|
||||
}
|
||||
|
||||
idfs := make(map[string]float64)
|
||||
for _, tokStr := range query {
|
||||
var occursInN int
|
||||
err := db.NewRaw(`SELECT COUNT(*) FROM (SELECT '' FROM tokens WHERE token = ? GROUP BY "document_id")`, tokStr).Scan(context.Background(), &occursInN)
|
||||
if err != nil {
|
||||
return nil, util.Wrap("count number of documents that term occurs in", err)
|
||||
}
|
||||
idfs[tokStr] = math.Log(totalNumDocs / float64(occursInN))
|
||||
}
|
||||
|
||||
var res []*Match
|
||||
|
||||
for docID, tokens := range tokensByDocument {
|
||||
doc := new(database.Document)
|
||||
if err := db.NewSelect().Model(doc).Where("id = ?", docID).Scan(context.Background(), doc); err != nil {
|
||||
return nil, util.Wrap("final assembly", err)
|
||||
}
|
||||
|
||||
m := &Match{
|
||||
Document: doc,
|
||||
Tokens: tokens,
|
||||
}
|
||||
|
||||
freq := termFrequencies[docID]
|
||||
|
||||
for _, tok := range tokens {
|
||||
m.Ranking += float64(freq[tok.Token]) * idfs[tok.Token]
|
||||
}
|
||||
|
||||
res = append(res, m)
|
||||
}
|
||||
|
||||
sort.Slice(res, func(i, j int) bool {
|
||||
return res[i].Ranking > res[j].Ranking
|
||||
})
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func PlaintextToTokens(plain string) []string {
|
||||
plain = filterPlaintextCharacters(plain)
|
||||
tokens := tokenise(plain)
|
||||
tokens = filterStopwords(tokens)
|
||||
stemTokens(tokens)
|
||||
return tokens
|
||||
}
|
||||
|
||||
// tokenise splits plaintext on whitespace and returns the words in lowercase.
//
// Runs of whitespace and leading or trailing whitespace never produce empty
// tokens, and text ending in a multi-byte character keeps its final word. An
// input with no words yields an empty result.
func tokenise(plaintext string) []string {
	return strings.Fields(strings.ToLower(plaintext))
}
|
||||
|
||||
func filterStopwords(tokens []string) []string {
|
||||
n := 0
|
||||
for _, tok := range tokens {
|
||||
_, found := stopwords[tok]
|
||||
if !found {
|
||||
tokens[n] = tok
|
||||
n += 1
|
||||
}
|
||||
}
|
||||
return tokens[:n]
|
||||
}
|
||||
|
||||
func stemTokens(tokens []string) {
|
||||
for i, tok := range tokens {
|
||||
tokens[i] = porter2.Stem(tok)
|
||||
}
|
||||
}
|
||||
|
||||
// filterPlaintextCharacters drops every character that is not an ASCII
// letter, an ASCII digit or a space, then collapses runs of spaces into one
// and trims the ends.
//
// Dropped characters are removed outright rather than replaced by a space, so
// words separated only by punctuation, tabs or newlines are joined together.
func filterPlaintextCharacters(plaintext string) string {
	keep := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z',
			r >= 'A' && r <= 'Z',
			r >= '0' && r <= '9',
			r == ' ':
			return r
		default:
			return -1 // a negative result tells strings.Map to drop the rune
		}
	}

	filtered := strings.Map(keep, plaintext)
	return strings.Join(strings.Fields(filtered), " ")
}
|
||||
|
||||
// stopwords is the set of words that filterStopwords removes from a token
// list.
//
// Every key must be lowercase: tokens are lowercased by tokenise before they
// are looked up here, so a key containing an uppercase letter can never match.
var stopwords = map[string]struct{}{
	"the":  {},
	"be":   {},
	"to":   {},
	"of":   {},
	"and":  {},
	"a":    {},
	"in":   {},
	"that": {},
	"have": {},
	"i":    {},
	"it":   {},
	"for":  {},
	"not":  {},
	"on":   {},
	"with": {},
	"he":   {},
	"as":   {},
	"you":  {},
	"do":   {},
	"at":   {},
	"this": {},
	"but":  {},
	"his":  {},
	"by":   {},
	"from": {},
}
|
38
ui/main.go
Normal file
38
ui/main.go
Normal file
|
@ -0,0 +1,38 @@
|
|||
package main
|
||||
|
||||
import (
	"fmt"
	"log/slog"
	"os"

	"git.tdpain.net/codemicro/hn84/ui/internal/config"
	"git.tdpain.net/codemicro/hn84/ui/internal/database"
	"git.tdpain.net/codemicro/hn84/ui/internal/search"
	"git.tdpain.net/codemicro/hn84/util"
)
|
||||
|
||||
func main() {
|
||||
if err := run(); err != nil {
|
||||
slog.Error("unhandled error", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
func run() error {
|
||||
db, err := database.Setup(config.Get().DatabaseName)
|
||||
if err != nil {
|
||||
return util.Wrap("setup database", err)
|
||||
}
|
||||
|
||||
query := search.PlaintextToTokens("reading list")
|
||||
|
||||
matches, err := search.DoSearch(db, query)
|
||||
if err != nil {
|
||||
return util.Wrap("run search", err)
|
||||
}
|
||||
|
||||
fmt.Println(query)
|
||||
|
||||
for _, m := range matches {
|
||||
fmt.Println(m.Document.Title, m.Ranking)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
12
util/util.go
12
util/util.go
|
@ -7,3 +7,15 @@ import (
|
|||
// Wrap prefixes err with label, producing "label: err". The result unwraps to
// err, so errors.Is and errors.As still see the original error.
//
// Wrap returns nil when err is nil, so a call can never turn a success into a
// failure.
func Wrap(label string, err error) error {
	if err == nil {
		return nil
	}
	return fmt.Errorf("%s: %w", label, err)
}
|
||||
|
||||
// Deduplicate returns the distinct elements of sliceList in the order they
// first appear. The input is not modified, and the result is nil when
// sliceList is empty.
func Deduplicate[T comparable](sliceList []T) []T {
	var unique []T
	seen := make(map[T]struct{})

	for _, item := range sliceList {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = struct{}{}
		unique = append(unique, item)
	}

	return unique
}
|
||||
|
|
Reference in a new issue