Add search module

This commit is contained in:
Hubert Van De Walle 2020-08-19 05:08:27 +02:00
parent 88b6eb56ae
commit 3861fb6b97
7 changed files with 404 additions and 0 deletions

View File

@ -10,6 +10,7 @@
<module>app</module>
<module>domain</module>
<module>shared</module>
<module>search</module>
</modules>
<packaging>pom</packaging>

50
search/pom.xml Normal file
View File

@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor of the "search" module; inherits from the be.simplenotes parent POM. -->
<parent>
<artifactId>parent</artifactId>
<groupId>be.simplenotes</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>search</artifactId>
<properties>
<!-- Single Lucene version applied to every org.apache.lucene artifact declared below. -->
<lucene.version>8.5.2</lucene.version>
</properties>
<dependencies>
<!-- Sibling module of this project (same groupId and version as the parent). -->
<dependency>
<groupId>be.simplenotes</groupId>
<artifactId>domain</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<!-- Lucene artifacts; all three share ${lucene.version}. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<!-- Test-scoped test-jar of the shared module. -->
<dependency>
<groupId>be.simplenotes</groupId>
<artifactId>shared</artifactId>
<version>1.0-SNAPSHOT</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,7 @@
package be.simplenotes.search
// Names of the fields of the Lucene documents built by this module.

/** Field holding the note's UUID in its string form. */
internal const val uuidField = "uuid"

/** Field holding the note's title. */
internal const val titleField = "title"

/** Field holding the note's tags, serialized into a single string. */
internal const val tagsField = "tags"

/** Field holding the note's content. */
internal const val contentField = "content"

/** Field holding the note's last-update timestamp in its string form. */
internal const val updatedAtField = "updatedAt"

View File

@ -0,0 +1,35 @@
package be.simplenotes.search
import be.simplenotes.domain.model.PersistedNote
import be.simplenotes.domain.model.PersistedNoteMetadata
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.StringField
import org.apache.lucene.document.TextField
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.TopDocs
/**
 * Builds the Lucene [Document] that represents this note in the index.
 *
 * The UUID and the update timestamp are added as [StringField]s, the title, tags and HTML
 * content as [TextField]s. Every field is stored so it can be read back from a search hit.
 */
internal fun PersistedNote.toDocument(): Document {
    val document = Document()

    // non searchable fields
    val uuidValue = UuidFieldConverter.toDoc(uuid)
    document.add(StringField(uuidField, uuidValue, Field.Store.YES))
    val updatedAtValue = LocalDateTimeFieldConverter.toDoc(updatedAt)
    document.add(StringField(updatedAtField, updatedAtValue, Field.Store.YES))

    // searchable fields
    document.add(TextField(titleField, meta.title, Field.Store.YES))
    val tagsValue = TagsFieldConverter.toDoc(meta.tags)
    document.add(TextField(tagsField, tagsValue, Field.Store.YES))
    document.add(TextField(contentField, html, Field.Store.YES))

    return document
}
/**
 * Loads the stored document of every hit through [searcher] and maps it back to note metadata.
 *
 * The returned list is fully built before this function returns.
 */
internal fun TopDocs.toResults(searcher: IndexSearcher) = scoreDocs.map { hit ->
    val stored = searcher.doc(hit.doc)
    PersistedNoteMetadata(
        title = stored.get(titleField),
        uuid = UuidFieldConverter.fromDoc(stored.get(uuidField)),
        updatedAt = LocalDateTimeFieldConverter.fromDoc(stored.get(updatedAtField)),
        tags = TagsFieldConverter.fromDoc(stored.get(tagsField))
    )
}

View File

@ -0,0 +1,26 @@
package be.simplenotes.search
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.*
/**
 * Two-way conversion between a value of type [T] and the string written to a document field.
 */
internal interface FieldConverter<T> {
/** Returns the string representation of [value] to put in a document field. */
fun toDoc(value: T): String
/** Rebuilds a [T] from the string [value] read from a document field. */
fun fromDoc(value: String): T
}
/** Converts [LocalDateTime] values to and from their `ISO_DATE_TIME` text form. */
internal object LocalDateTimeFieldConverter : FieldConverter<LocalDateTime> {
    private val formatter = DateTimeFormatter.ISO_DATE_TIME

    override fun toDoc(value: LocalDateTime): String {
        return value.format(formatter)
    }

    override fun fromDoc(value: String): LocalDateTime {
        return LocalDateTime.parse(value, formatter)
    }
}
/** Converts [UUID] values to and from their canonical string form. */
internal object UuidFieldConverter : FieldConverter<UUID> {
    override fun toDoc(value: UUID): String {
        return value.toString()
    }

    override fun fromDoc(value: String): UUID {
        return UUID.fromString(value)
    }
}
/**
 * Serializes a list of tags as a single space-separated string and parses it back.
 *
 * Because a space is the separator, a tag that itself contains a space comes back as several
 * tags.
 */
internal object TagsFieldConverter : FieldConverter<List<String>> {
    /** Joins the tags with single spaces; an empty list becomes the empty string. */
    override fun toDoc(value: List<String>): String = value.joinToString(" ")

    /**
     * Splits on spaces and drops empty segments.
     *
     * Splitting the empty string yields one empty element, so without the filter a note that
     * has no tags would come back with a single empty tag.
     */
    override fun fromDoc(value: String): List<String> = value.split(" ").filter { it.isNotEmpty() }
}

View File

@ -0,0 +1,123 @@
package be.simplenotes.search
import be.simplenotes.domain.model.PersistedNote
import be.simplenotes.domain.model.PersistedNoteMetadata
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.index.*
import org.apache.lucene.search.*
import org.apache.lucene.store.Directory
import org.apache.lucene.store.FSDirectory
import org.slf4j.LoggerFactory
import java.io.File
import java.io.IOException
import java.nio.file.FileVisitResult
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.SimpleFileVisitor
import java.nio.file.attribute.BasicFileAttributes
import java.util.*
/**
 * Criteria of a note search; a `null` property means that field is not searched.
 *
 * @property title term looked up in note titles
 * @property tag term looked up in note tags
 * @property content term looked up in note content
 */
data class SearchTerms(
    val title: String?,
    val tag: String?,
    val content: String?
)
/**
 * Maintains one Lucene index per user and answers fuzzy searches against it.
 *
 * Every operation opens the user's index directory, does its work and closes everything it
 * opened, so no directory, reader or writer is kept open between calls.
 *
 * @param basePath directory under which each user's index is kept, in a sub-directory named
 * after the user id.
 */
class NoteSearcher(basePath: Path = Path.of("/tmp", "lucene")) {
    private val baseFile = basePath.toFile()
    private val logger = LoggerFactory.getLogger(javaClass)

    // region utils

    /** Location of the index of [userId] below the base directory. */
    private fun indexPath(userId: Int): Path = File(baseFile, userId.toString()).toPath()

    private fun getDirectory(userId: Int): Directory = FSDirectory.open(indexPath(userId))

    /** Term matching the document whose uuid field equals [uuid]. */
    private fun uuidTerm(uuid: UUID) = Term(uuidField, UuidFieldConverter.toDoc(uuid))

    /**
     * Runs [block] with a writer on the index of [userId] and commits afterwards.
     *
     * The writer and the directory are closed even when [block] throws; the original code left
     * both open in that case.
     */
    private fun withWriter(userId: Int, block: (IndexWriter) -> Unit) {
        getDirectory(userId).use { dir ->
            IndexWriter(dir, IndexWriterConfig(StandardAnalyzer())).use { writer ->
                block(writer)
                writer.commit()
            }
        }
    }

    /** Combines one optional fuzzy clause per non-null search term. */
    private fun buildQuery(terms: SearchTerms): Query {
        val builder = BooleanQuery.Builder()
        val candidates = listOf(
            titleField to terms.title,
            tagsField to terms.tag,
            contentField to terms.content
        )
        for ((field, text) in candidates) {
            if (text != null) {
                builder.add(BooleanClause(FuzzyQuery(Term(field, text)), BooleanClause.Occur.SHOULD))
            }
        }
        return builder.build()
    }

    // endregion

    /** Adds [note] to the index of [userId]. */
    fun indexNote(userId: Int, note: PersistedNote) {
        logger.debug("Indexing note ${note.uuid} for user $userId")
        withWriter(userId) { it.addDocument(note.toDocument()) }
    }

    /** Removes the document whose uuid field equals [uuid] from the index of [userId]. */
    fun deleteIndex(userId: Int, uuid: UUID) {
        logger.debug("Deleting indexing $uuid for user $userId")
        withWriter(userId) { it.deleteDocuments(TermQuery(uuidTerm(uuid))) }
    }

    /**
     * Replaces the indexed version of [note] in the index of [userId].
     *
     * Uses a single `updateDocument` call so the delete and the add happen in one writer
     * session instead of two.
     */
    fun updateIndex(userId: Int, note: PersistedNote) {
        logger.debug("Updating note ${note.uuid} for user $userId")
        withWriter(userId) { it.updateDocument(uuidTerm(note.uuid), note.toDocument()) }
    }

    /**
     * Searches the index of [userId] and returns at most [MAX_RESULTS] hits.
     *
     * Returns an empty list when the user has no index yet.
     *
     * NOTE(review): each term is used verbatim as a single fuzzy term and is not run through
     * the analyzer, so upper-case or multi-word input presumably will not match the tokens
     * produced at indexing time. Confirm whether callers normalize their input.
     */
    fun search(userId: Int, terms: SearchTerms): List<PersistedNoteMetadata> {
        val query = buildQuery(terms)
        logger.debug("Searching: $query")
        return getDirectory(userId).use { dir ->
            if (!DirectoryReader.indexExists(dir)) {
                emptyList<PersistedNoteMetadata>()
            } else {
                // toResults builds the whole list before the reader is closed.
                DirectoryReader.open(dir).use { reader ->
                    val searcher = IndexSearcher(reader)
                    searcher.search(query, MAX_RESULTS).toResults(searcher)
                }
            }
        }
    }

    /**
     * Deletes the whole index directory of [userId].
     *
     * Best effort: a missing index is not an error, and an I/O failure while deleting is logged
     * and not propagated.
     */
    fun dropIndex(userId: Int) {
        val index = indexPath(userId)
        if (!Files.exists(index)) return

        try {
            Files.walkFileTree(
                index,
                object : SimpleFileVisitor<Path>() {
                    override fun visitFile(file: Path, attrs: BasicFileAttributes?): FileVisitResult {
                        Files.delete(file)
                        return FileVisitResult.CONTINUE
                    }

                    override fun postVisitDirectory(dir: Path, exc: IOException?): FileVisitResult {
                        // A failure while listing the directory is reported here; surface it.
                        if (exc != null) throw exc
                        Files.delete(dir)
                        return FileVisitResult.CONTINUE
                    }
                }
            )
        } catch (e: IOException) {
            logger.warn("Could not drop index for user $userId", e)
        }
    }

    private companion object {
        /** Maximum number of hits returned by [search]. */
        const val MAX_RESULTS = 10
    }
}

View File

@ -0,0 +1,162 @@
package be.simplenotes.search
import be.simplenotes.domain.model.NoteMetadata
import be.simplenotes.domain.model.PersistedNote
import be.simplenotes.domain.model.PersistedNoteMetadata
import org.assertj.core.api.Assertions.assertThat
import org.intellij.lang.annotations.Language
import org.junit.jupiter.api.AfterAll
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.parallel.ResourceLock
import java.time.LocalDateTime
import java.util.*
/**
 * Tests of [NoteSearcher] that index and search notes for user 1.
 *
 * NOTE(review): [dropIndexes] is an instance method annotated with `@AfterAll`; this presumably
 * relies on a per-class test instance lifecycle configured outside this file. Confirm.
 */
@ResourceLock("lucene")
internal class NoteSearcherTest {

    // region setup

    private val searcher = NoteSearcher()

    /** HTML body shared by the content-based tests. */
    @Language("html")
    private val luceneHtml =
        """
        <div>
        <h1 class="title">Apache Lucene Core</h1>
        <p>Apache Lucene<span style="vertical-align: super; font-size: xx-small">TM</span> is a
        high-performance, full-featured text search engine library written entirely in Java.
        It is a technology suitable for nearly any application that requires full-text search,
        especially cross-platform.</p>
        <p>Apache Lucene is an open source project available for free download. Please use the
        links on the right to access Lucene.</p>
        <h1 id="lucenetm-features">Lucene<span style="vertical-align: super; font-size: xx-small">TM</span> Features</h1>
        <p>Lucene offers powerful features through a simple API:</p>
        <h2 id="scalable-high-performance-indexing">Scalable, High-Performance Indexing</h2>
        <ul>
        <li>over <a href="http://home.apache.org/~mikemccand/lucenebench/indexing.html">150GB/hour on modern hardware</a></li>
        <li>small RAM requirements -- only 1MB heap</li>
        <li>incremental indexing as fast as batch indexing</li>
        <li>index size roughly 20-30% the size of text indexed</li>
        </ul>
        """.trimIndent()

    /** Creates a note from the given parts, indexes it for user 1 and returns it. */
    private fun index(
        title: String,
        tags: List<String> = emptyList(),
        content: String = "",
        uuid: UUID = UUID.randomUUID(),
    ): PersistedNote =
        PersistedNote(NoteMetadata(title, tags), markdown = "", content, LocalDateTime.now(), uuid)
            .also { searcher.indexNote(1, it) }

    /** Searches the index of user 1 with the given terms. */
    private fun search(
        title: String? = null,
        tag: String? = null,
        content: String? = null,
    ): List<PersistedNoteMetadata> {
        val terms = SearchTerms(title, tag, content)
        return searcher.search(1, terms)
    }

    @BeforeEach
    @AfterAll
    fun dropIndexes() {
        searcher.dropIndex(1)
    }

    // endregion

    @Test
    fun `exact title search`() {
        listOf("first", "second", "flip").forEach { index(it) }

        val hits = search("first")
        assertThat(hits)
            .hasSizeGreaterThanOrEqualTo(1)
            .anyMatch { it.title == "first" }

        assertThat(search("nothing")).isEmpty()
    }

    @Test
    fun `fuzzy title search`() {
        listOf("first", "second", "flip").forEach { index(it) }

        val hits = search("firt")
        assertThat(hits)
            .hasSizeGreaterThanOrEqualTo(1)
            .anyMatch { it.title == "first" }

        assertThat(search("nothing")).isEmpty()
    }

    @Test
    fun `exact tags search`() {
        index("first", tags = listOf("example", "flamingo"))
        index("second", tags = listOf("yes"))
        index("second")

        val hits = search(tag = "example")
        assertThat(hits)
            .hasSize(1)
            .anyMatch { it.title == "first" }
    }

    @Test
    fun `exact content search`() {
        index("first", content = luceneHtml)

        val exactHits = search(content = "fast")
        assertThat(exactHits)
            .hasSize(1)
            .anyMatch { it.title == "first" }

        @Suppress("SpellCheckingInspection")
        val misspelledHits = search(content = "preformance") // <- note the error
        assertThat(misspelledHits)
            .hasSize(1)
            .anyMatch { it.title == "first" }
    }

    @Test
    fun `combined search`() {
        index("first", content = luceneHtml, tags = listOf("abc"))

        val hits = search(title = "fir", tag = "abc", content = "20")
        assertThat(hits).hasSize(1)
    }

    @Test
    fun `delete index`() {
        val indexed = index("first")

        searcher.deleteIndex(1, indexed.uuid)

        assertThat(search("first")).isEmpty()
    }

    @Test
    fun `update index`() {
        val original = index("first")
        val renamed = original.copy(meta = original.meta.copy(title = "new"))

        searcher.updateIndex(1, renamed)

        assertThat(search("first")).isEmpty()
        assertThat(search("new")).hasSize(1)
    }
}