Skip to content

Commit

Permalink
Extract matched context from markdown tables
Browse files Browse the repository at this point in the history
* Searches now match content using case-insensitive matching.
* Only the matched table cell is extracted, so that the results list is
  not flooded with text
  • Loading branch information
msp301 committed Nov 9, 2023
1 parent 5dc9281 commit b71c1a2
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 44 deletions.
37 changes: 2 additions & 35 deletions notebook/search.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
package notebook

import (
"github.com/lithammer/fuzzysearch/fuzzy"
"github.com/msp301/zb/graph"
"github.com/msp301/zb/parser"
"github.com/msp301/zb/util"
"regexp"
"strings"
)

Expand All @@ -23,7 +21,7 @@ func (book *Notebook) Search(query ...string) []Result {
for _, paragraph := range paragraphs {
termsMatched := 0
for _, term := range query {
if matches(paragraph, term) {
if util.ParagraphMatches(paragraph, term) {
termsMatched++
}
}
Expand All @@ -33,7 +31,7 @@ func (book *Notebook) Search(query ...string) []Result {
}

for _, term := range query {
extracted, ok := util.Context(paragraph, term)
extracted, ok := util.ContextFold(paragraph, term)
if ok {
context = append(context, extracted...)
matched = true
Expand Down Expand Up @@ -62,34 +60,3 @@ func (book *Notebook) Search(query ...string) []Result {
// extractParagraphs breaks content into paragraphs, treating each pair of
// consecutive newline characters as a paragraph separator.
func extractParagraphs(content string) []string {
	const paragraphSeparator = "\n\n"
	paragraphs := strings.Split(content, paragraphSeparator)
	return paragraphs
}

// queryUppercaseRegex detects an ASCII uppercase letter in a search query.
// It is compiled once at package scope instead of once per token.
var queryUppercaseRegex = regexp.MustCompile("[A-Z]")

// matches reports whether any whitespace-separated token of content matches
// query.
//
// A token matches when one of the following holds:
//   - query is longer than three bytes and is a case-sensitive prefix of the
//     token;
//   - the fuzzy rank of query against the token is 0;
//   - the fuzzy rank is below 50% of the token's byte length.
//
// Queries containing an ASCII uppercase letter are ranked with
// fuzzy.RankMatchNormalized; all other queries are ranked with
// fuzzy.RankMatchNormalizedFold.
func matches(content string, query string) bool {
	// Both checks depend only on the query, so evaluate them once rather
	// than for every token.
	prefixEligible := len(query) > 3
	caseSensitive := queryUppercaseRegex.MatchString(query)

	for _, token := range strings.Fields(content) {
		if prefixEligible && strings.HasPrefix(token, query) {
			return true
		}

		var distance int
		if caseSensitive {
			distance = fuzzy.RankMatchNormalized(query, token)
		} else {
			distance = fuzzy.RankMatchNormalizedFold(query, token)
		}

		// A rank of -1 is skipped (presumably "no fuzzy match" — confirm
		// against the fuzzysearch documentation).
		if distance == -1 {
			continue
		}

		if distance == 0 {
			return true
		}

		// strings.Fields never yields empty tokens, so len(token) > 0 here.
		distancePercent := (float64(distance) / float64(len(token))) * 100
		if distancePercent < 50 {
			return true
		}
	}
	return false
}
51 changes: 42 additions & 9 deletions util/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@ import (
"strings"
)

var mdListRegex = regexp.MustCompile(`^(\s*)(?:\*|\+|-|\d+[.)])\s+`);
var mdListEntryRegex = regexp.MustCompile(`^(\s*)(?:(?:\*|\+|-|\d+[.)])\s+)?([^\n]+)`);
// mdListRegex matches the start of a markdown list entry: optional leading
// whitespace (capture group 1), a list marker (`*`, `+`, `-`, or digits
// followed by `.` or `)`), and at least one whitespace character.
var mdListRegex = regexp.MustCompile(`^(\s*)(?:\*|\+|-|\d+[.)])\s+`)

// mdListEntryRegex splits a single line into its leading whitespace (capture
// group 1) and its text (capture group 2), skipping over a list marker when
// one is present.
var mdListEntryRegex = regexp.MustCompile(`^(\s*)(?:(?:\*|\+|-|\d+[.)])\s+)?([^\n]+)`)

// contextRegexCache holds the compiled context regex for each input phrase so
// that the same phrase is not compiled more than once.
// NOTE(review): this is a plain map with no visible synchronization — confirm
// it is never accessed from multiple goroutines.
var contextRegexCache = make(map[string]*regexp.Regexp)

func Context(s string, phrase string) ([]string, bool) {
// ContextMatchFunc reports whether s matches phrase. It is the matching
// strategy applied to individual list lines, table rows and table cells when
// deciding which of them to keep as context.
type ContextMatchFunc func(s string, phrase string) bool

func context(s string, phrase string, matchFunc ContextMatchFunc) ([]string, bool) {
contextRegex := contextRegex(phrase)
matches := contextRegex.FindAllStringSubmatch(s, -1)
if matches == nil {
Expand All @@ -24,13 +27,27 @@ func Context(s string, phrase string) ([]string, bool) {
match := strings.TrimSpace(match[0])
if isMarkdownList(match) {
for _, line := range strings.Split(match, "\n") {
if strings.Contains(line, phrase) {
if matchFunc(line, phrase) {
context := mdListEntryRegex.FindStringSubmatch(line)
contexts = append(contexts, context[2])
}
}
continue
}

if isMarkdownTable(match) {
for _, row := range strings.Split(match, "\n") {
if matchFunc(row, phrase) {
for _, cell := range strings.Split(row, "|") {
if matchFunc(cell, phrase) {
contexts = append(contexts, strings.TrimSpace(cell))
}
}
}
}
continue
}

contexts = append(contexts, match)
}

Expand All @@ -41,11 +58,27 @@ func isMarkdownList(line string) bool {
return mdListRegex.MatchString(line)
}

// isMarkdownTable reports whether line begins with a pipe character, which is
// how this package recognises markdown table text.
func isMarkdownTable(line string) bool {
	const pipe = "|"
	startsWithPipe := strings.HasPrefix(line, pipe)
	return startsWithPipe
}

// contextRegex returns the compiled regular expression used to find the text
// surrounding phrase, compiling it on first use and serving it from
// contextRegexCache afterwards.
//
// The pattern is case-insensitive. It matches phrase literally (it is escaped
// with regexp.QuoteMeta) together with the surrounding run of text on either
// side, where that run may span single newlines but stops at a blank line.
func contextRegex(phrase string) *regexp.Regexp {
	// A nil entry is treated the same as a missing one, matching the original
	// nil check on the map value.
	if cached := contextRegexCache[phrase]; cached != nil {
		return cached
	}

	compiled := regexp.MustCompile(`(?i)(?:[^\n]\n?)*` + regexp.QuoteMeta(phrase) + `(?:[^\n]\n?)*`)
	contextRegexCache[phrase] = compiled
	return compiled
}

// Context extracts the pieces of s that surround phrase. Individual markdown
// list lines, table rows and table cells are kept only when they contain
// phrase as a case-sensitive substring.
//
// Both results are forwarded unchanged from context; the boolean is treated
// by the package tests as an "ok" flag.
func Context(s string, phrase string) ([]string, bool) {
	// strings.Contains already has the ContextMatchFunc shape, so no wrapper
	// closure (which previously ignored its own second argument) is needed.
	return context(s, phrase, strings.Contains)
}

return contextRegexCache[phrase]
// ContextFold extracts the pieces of s that surround phrase, like Context,
// but keeps markdown list lines, table rows and table cells when they contain
// phrase regardless of letter case (both sides are compared after
// strings.ToLower).
//
// Both results are forwarded unchanged from context; the boolean is treated
// by the package tests as an "ok" flag.
func ContextFold(s string, phrase string) ([]string, bool) {
	// Lower-case the phrase once instead of on every matcher invocation.
	foldedPhrase := strings.ToLower(phrase)

	// The second argument is always the phrase passed to context above, so it
	// is deliberately ignored in favour of the pre-folded copy.
	return context(s, phrase, func(candidate string, _ string) bool {
		return strings.Contains(strings.ToLower(candidate), foldedPhrase)
	})
}
29 changes: 29 additions & 0 deletions util/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ func TestContext(t *testing.T) {

{" * This is a list entry\n* list about nothing\nor maybe something", "list", []string{"This is a list entry", "list about nothing"}},
{"Example 1\n\nanother example\n\nand another", "another", []string{"another example", "and another"}},

{"|Column A|Column B|\n|------|------|\n|Value foo|Value bar|", "foo", []string{"Value foo"}},
{"| Column A | Column B |\n| ------ | ------ |\n| Value foo | Value bar |", "foo", []string{"Value foo"}},
}

for _, test := range tests {
Expand All @@ -41,6 +44,32 @@ func TestContext(t *testing.T) {
t.Fatalf("expected '%s' but was '%s'", test.want, got)
}
})
}
}

// TestContextFold checks that ContextFold returns only the matching table
// cell, whether or not the cell's letter case agrees with the phrase.
func TestContextFold(t *testing.T) {
	type testCase struct {
		source string
		phrase string
		want   []string
	}

	cases := []testCase{
		// Same case in table and phrase, with and without cell padding.
		{"|Column A|Column B|\n|------|------|\n|Value foo|Value bar|", "foo", []string{"Value foo"}},
		{"| Column A | Column B |\n| ------ | ------ |\n| Value foo | Value bar |", "foo", []string{"Value foo"}},

		// Capitalised cell content searched with a lower-case phrase.
		{"|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", "foo", []string{"Value Foo"}},
		{"| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", "foo", []string{"Value Foo"}},
	}

	for i := range cases {
		tc := cases[i]
		name := fmt.Sprintf("ContextFold('%v', '%v')", tc.source, tc.phrase)
		t.Run(name, func(t *testing.T) {
			got, ok := ContextFold(tc.source, tc.phrase)
			if !ok {
				t.Fatalf("Expected ok but was not ok")
			}
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Fatalf("expected '%s' but was '%s'", tc.want, got)
		})
	}
}
42 changes: 42 additions & 0 deletions util/paragraph.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package util

import (
"github.com/lithammer/fuzzysearch/fuzzy"
"regexp"
"strings"
)

var noiseRegex = regexp.MustCompile(`[^\s\w#.]`)

// uppercaseLetterRegex detects an ASCII uppercase letter in a search query.
// It is compiled once at package scope instead of once per token.
var uppercaseLetterRegex = regexp.MustCompile("[A-Z]")

// ParagraphMatches reports whether any token of content matches query.
//
// Before tokenising, every character matched by noiseRegex is replaced with a
// space, and the result is split on whitespace. A token matches when one of
// the following holds:
//   - query is longer than three bytes and is a case-sensitive prefix of the
//     token;
//   - the fuzzy rank of query against the token is 0;
//   - the fuzzy rank is below 50% of the token's byte length;
//   - the fuzzy rank is exactly 50% of the byte length of a two-byte token.
//
// Queries containing an ASCII uppercase letter are ranked with
// fuzzy.RankMatchNormalized; all other queries are ranked with
// fuzzy.RankMatchNormalizedFold.
func ParagraphMatches(content string, query string) bool {
	const thresholdPercent = 50.0

	// Both checks depend only on the query, so evaluate them once rather
	// than for every token.
	prefixEligible := len(query) > 3
	caseSensitive := uppercaseLetterRegex.MatchString(query)

	content = noiseRegex.ReplaceAllString(content, " ")
	for _, token := range strings.Fields(content) {
		if prefixEligible && strings.HasPrefix(token, query) {
			return true
		}

		var distance int
		if caseSensitive {
			distance = fuzzy.RankMatchNormalized(query, token)
		} else {
			distance = fuzzy.RankMatchNormalizedFold(query, token)
		}

		// A rank of -1 is skipped (presumably "no fuzzy match" — confirm
		// against the fuzzysearch documentation).
		if distance == -1 {
			continue
		}

		if distance == 0 {
			return true
		}

		// strings.Fields never yields empty tokens, so len(token) > 0 here.
		distancePercent := (float64(distance) / float64(len(token))) * 100
		if distancePercent < thresholdPercent {
			return true
		}

		// Two-byte tokens are also accepted at exactly the threshold,
		// presumably so that a one-character query can match a token such
		// as "C#" — confirm against the package tests.
		if distancePercent == thresholdPercent && len(token) == 2 {
			return true
		}
	}
	return false
}
35 changes: 35 additions & 0 deletions util/paragraph_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package util

import (
"fmt"
"testing"
)

// TestParagraphMatches checks ParagraphMatches against plain words, a token
// containing punctuation, and markdown table content in both matching and
// differing letter case.
func TestParagraphMatches(t *testing.T) {
	type testCase struct {
		query   string
		content string
		want    bool
	}

	cases := []testCase{
		// Plain text tokens.
		{"string", "SomeString", true},
		{"string", "Some String", true},
		{"c", "C#", true},

		// Table content searched with a matching-case query.
		{"Foo", "|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", true},
		{"Foo", "| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", true},

		// Table content searched with a lower-case query.
		{"foo", "|Column A|Column B|\n|------|------|\n|Value Foo|Value Bar|", true},
		{"foo", "| Column A | Column B |\n| ------ | ------ |\n| Value Foo | Value Bar |", true},

		{"booom", "| Thing | Booom |\n", true},
	}

	for i := range cases {
		tc := cases[i]
		name := fmt.Sprintf("ParagraphMatches('%v', '%v')", tc.content, tc.query)
		t.Run(name, func(t *testing.T) {
			got := ParagraphMatches(tc.content, tc.query)
			if got == tc.want {
				return
			}
			t.Fatalf("expected %t but was %t", tc.want, got)
		})
	}
}

0 comments on commit b71c1a2

Please sign in to comment.