add rutor api

2025-12-19 13:36:09 +05:00 · 2023-01-29 20:04:24 +03:00
parent d71c42936d
commit fbd1765331
7 changed files with 428 additions and 0 deletions
--- a/server/rutor/torrsearch/filter.go
+++ b/server/rutor/torrsearch/filter.go
@@ -0,0 +1,99 @@
+package torrsearch
+
+import (
+	"strings"
+
+	snowballeng "github.com/kljensen/snowball/english"
+	snowballru "github.com/kljensen/snowball/russian"
+)
+
+// lowercaseFilter returns a new slice, the same length as tokens, in which
+// every token has been lower-cased and then passed through replaceChars.
+func lowercaseFilter(tokens []string) []string {
+	normalized := make([]string, 0, len(tokens))
+	for _, tok := range tokens {
+		normalized = append(normalized, replaceChars(strings.ToLower(tok)))
+	}
+	return normalized
+}
+
+// stopwordFilter copies tokens into a new slice, leaving out every token for
+// which isStopWord reports true. The relative order of the kept tokens is
+// unchanged.
+func stopwordFilter(tokens []string) []string {
+	kept := make([]string, 0, len(tokens))
+	for i := range tokens {
+		if isStopWord(tokens[i]) {
+			continue
+		}
+		kept = append(kept, tokens[i])
+	}
+	return kept
+}
+
+// stemmerFilter returns the stemmed form of every token.
+//
+// Each token is run through both the English and the Russian snowball
+// stemmer. The Russian stem is used when it differs from the token; otherwise
+// the English stem is used. A token for which either stemmer returns an empty
+// string is dropped from the result altogether, so the returned slice may be
+// shorter than tokens but never contains empty strings.
+func stemmerFilter(tokens []string) []string {
+	// Start with length 0 and append: pre-sizing the slice to len(tokens) and
+	// assigning by index would leave "" entries behind for skipped tokens, and
+	// those empty tokens would then be indexed and searched for.
+	r := make([]string, 0, len(tokens))
+	for _, token := range tokens {
+		worden := snowballeng.Stem(token, false)
+		wordru := snowballru.Stem(token, false)
+		if wordru == "" || worden == "" {
+			// Nothing usable came out of one of the stemmers; skip the token.
+			continue
+		}
+		if wordru != token {
+			r = append(r, wordru)
+		} else {
+			r = append(r, worden)
+		}
+	}
+	return r
+}
+
+// replaceChars returns word with every lower-case 'ё' replaced by 'е'.
+// All other runes are copied through unchanged.
+func replaceChars(word string) string {
+	runes := make([]rune, 0, len(word))
+	for _, ch := range word {
+		if ch == 'ё' {
+			ch = 'е'
+		}
+		runes = append(runes, ch)
+	}
+	return string(runes)
+}
+
+// isStopWord reports whether word is one of the built-in English or Russian
+// stop words. The check is an exact, case-sensitive string match and every
+// entry in the lists is lower case; the Russian entries are spelled with "е"
+// rather than "ё" (for example "ее", "еще").
+func isStopWord(word string) bool {
+	switch word {
+	// English stop words.
+	case "a", "about", "above", "after", "again", "against", "all", "am", "an",
+		"and", "any", "are", "as", "at", "be", "because", "been", "before",
+		"being", "below", "between", "both", "but", "by", "can", "did", "do",
+		"does", "doing", "don", "down", "during", "each", "few", "for", "from",
+		"further", "had", "has", "have", "having", "he", "her", "here", "hers",
+		"herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is",
+		"it", "its", "itself", "just", "me", "more", "most", "my", "myself",
+		"no", "nor", "not", "now", "of", "off", "on", "once", "only", "or",
+		"other", "our", "ours", "ourselves", "out", "over", "own", "s", "same",
+		"she", "should", "so", "some", "such", "t", "than", "that", "the", "their",
+		"theirs", "them", "themselves", "then", "there", "these", "they",
+		"this", "those", "through", "to", "too", "under", "until", "up",
+		"very", "was", "we", "were", "what", "when", "where", "which", "while",
+		"who", "whom", "why", "will", "with", "you", "your", "yours", "yourself",
+		"yourselves":
+		return true
+
+	// Russian stop words.
+	case "и", "в", "во", "не", "что", "он", "на", "я", "с",
+		"со", "как", "а", "то", "все", "она", "так", "его",
+		"но", "да", "ты", "к", "у", "же", "вы", "за", "бы",
+		"по", "только", "ее", "мне", "было", "вот", "от",
+		"меня", "еще", "нет", "о", "из", "ему", "теперь",
+		"когда", "даже", "ну", "вдруг", "ли", "если", "уже",
+		"или", "ни", "быть", "был", "него", "до", "вас",
+		"нибудь", "опять", "уж", "вам", "ведь", "там", "потом",
+		"себя", "ничего", "ей", "может", "они", "тут", "где",
+		"есть", "надо", "ней", "для", "мы", "тебя", "их",
+		"чем", "была", "сам", "чтоб", "без", "будто", "чего",
+		"раз", "тоже", "себе", "под", "будет", "ж", "тогда",
+		"кто", "этот", "того", "потому", "этого", "какой",
+		"совсем", "ним", "здесь", "этом", "один", "почти",
+		"мой", "тем", "чтобы", "нее", "сейчас", "были", "куда",
+		"зачем", "всех", "никогда", "можно", "при", "наконец",
+		"два", "об", "другой", "хоть", "после", "над", "больше",
+		"тот", "через", "эти", "нас", "про", "всего", "них",
+		"какая", "много", "разве", "три", "эту", "моя",
+		"впрочем", "хорошо", "свою", "этой", "перед", "иногда",
+		"лучше", "чуть", "том", "нельзя", "такой", "им", "более",
+		"всегда", "конечно", "всю", "между":
+		return true
+
+	default:
+		return false
+	}
+}
--- a/server/rutor/torrsearch/index.go
+++ b/server/rutor/torrsearch/index.go
@@ -0,0 +1,76 @@
+package torrsearch
+
+import (
+	"log"
+	"server/rutor/models"
+	"strconv"
+)
+
+// Index is an inverted index: it maps each token to the IDs of the documents that contain it.
+type Index map[string][]int
+
+var idx Index // package-level index: assigned by NewIndex, queried by Search
+
+// NewIndex builds a fresh inverted index over torrs and installs it as the
+// package-level index queried by Search. Document IDs are the positions of
+// the entries within torrs.
+//
+// The new index is filled in a local map and assigned to idx only once it is
+// complete. Assigning the empty map first and filling it afterwards would let
+// a Search that runs during a rebuild observe an empty or partially built
+// index and read a map that is still being written to.
+// NOTE(review): the assignment to idx itself is not synchronized; if rebuilds
+// and searches really run on different goroutines, guard idx with a mutex or
+// an atomic pointer.
+func NewIndex(torrs []*models.TorrentDetails) {
+	log.Println("Index torrs")
+	fresh := make(Index)
+	fresh.add(torrs)
+	idx = fresh
+}
+
+// Search looks text up in the package-level index and returns the IDs of the
+// matching documents, as produced by Index.search.
+func Search(text string) []int {
+	matches := idx.search(text)
+	return matches
+}
+
+// add indexes every entry of torrs, using the entry's position in the slice
+// as its document ID. The indexed text of an entry is its Name, the result of
+// GetNames and its Year, joined by spaces and run through analyze.
+func (idx Index) add(torrs []*models.TorrentDetails) {
+	for docID, t := range torrs {
+		text := t.Name + " " + t.GetNames() + " " + strconv.Itoa(t.Year)
+		for _, tok := range analyze(text) {
+			posting := idx[tok]
+			// IDs are appended in ascending order, so a repeated token of the
+			// current document can only show up as the last element.
+			if posting == nil || posting[len(posting)-1] != docID {
+				idx[tok] = append(posting, docID)
+			}
+		}
+	}
+}
+
+// intersection returns the set intersection between a and b.
+// a and b have to be sorted in ascending order and contain no duplicates.
+//
+// The result is always a non-nil slice, even when it is empty; search relies
+// on that to tell "no match so far" apart from "no token processed yet".
+func intersection(a []int, b []int) []int {
+	// An intersection can never hold more elements than the shorter input,
+	// so that is the tightest capacity worth reserving.
+	minLen := len(a)
+	if len(b) < minLen {
+		minLen = len(b)
+	}
+	r := make([]int, 0, minLen)
+	var i, j int
+	for i < len(a) && j < len(b) {
+		switch {
+		case a[i] < b[j]:
+			i++
+		case a[i] > b[j]:
+			j++
+		default:
+			// Present in both inputs.
+			r = append(r, a[i])
+			i++
+			j++
+		}
+	}
+	return r
+}
+
+// search analyzes text and returns the IDs of the documents that contain
+// every resulting token. It returns nil when text yields no tokens or when
+// any token is absent from the index. For a single-token query the returned
+// slice is the index's own posting list, so callers must not modify it.
+func (idx Index) search(text string) []int {
+	var matches []int
+	for _, tok := range analyze(text) {
+		posting, found := idx[tok]
+		if !found {
+			// An unknown token means no document can match the whole query.
+			return nil
+		}
+		if matches == nil {
+			matches = posting
+			continue
+		}
+		matches = intersection(matches, posting)
+	}
+	return matches
+}
--- a/server/rutor/torrsearch/tokenizer.go
+++ b/server/rutor/torrsearch/tokenizer.go
@@ -0,0 +1,23 @@
+package torrsearch
+
+import (
+	"strings"
+	"unicode"
+)
+
+// tokenize splits text into its maximal runs of letters and numbers; every
+// other rune acts as a separator and is discarded.
+func tokenize(text string) []string {
+	isSeparator := func(r rune) bool {
+		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
+	}
+	return strings.FieldsFunc(text, isSeparator)
+}
+
+// analyze turns text into index tokens by applying, in order: tokenize,
+// lowercaseFilter, stopwordFilter and stemmerFilter.
+func analyze(text string) []string {
+	return stemmerFilter(stopwordFilter(lowercaseFilter(tokenize(text))))
+}