Add search module

2020-08-19 05:08:27 +02:00 · 2020-08-19 05:08:27 +02:00 · 3861fb6b97
commit 3861fb6b97
parent 88b6eb56ae
7 changed files with 404 additions and 0 deletions
--- a/pom.xml
+++ b/pom.xml
@ -10,6 +10,7 @@
        <module>app</module>
        <module>domain</module>
        <module>shared</module>
+        <module>search</module>
    </modules>

    <packaging>pom</packaging>
--- a/search/pom.xml
+++ b/search/pom.xml
@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>parent</artifactId>
+        <groupId>be.simplenotes</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>search</artifactId>
+
+    <properties>
+        <lucene.version>8.5.2</lucene.version>
+    </properties>
+
+    <dependencies>
+        <!-- Domain module: the search sources import be.simplenotes.domain.model types. -->
+        <dependency>
+            <groupId>be.simplenotes</groupId>
+            <artifactId>domain</artifactId>
+            <version>1.0-SNAPSHOT</version>
+        </dependency>
+        <!-- Lucene artifacts; all three are pinned to the lucene.version property. -->
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-queryparser</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+
+        <!-- Test scope only: the test-jar of the shared module. -->
+        <!-- NOTE(review): NoteSearcher.kt imports org.slf4j, but no slf4j artifact is declared here;
+             presumably it is inherited or arrives transitively. Confirm. -->
+        <dependency>
+            <groupId>be.simplenotes</groupId>
+            <artifactId>shared</artifactId>
+            <version>1.0-SNAPSHOT</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+
+</project>
--- a/search/src/main/kotlin/Constants.kt
+++ b/search/src/main/kotlin/Constants.kt
@ -0,0 +1,7 @@
+package be.simplenotes.search
+
+// Names of the Lucene document fields shared by the code that writes note
+// documents and the code that reads search hits back.
+// NOTE(review): Kotlin convention spells const vals in SCREAMING_SNAKE_CASE;
+// a rename would have to touch every usage in this module.
+internal const val uuidField = "uuid"
+internal const val titleField = "title"
+internal const val tagsField = "tags"
+internal const val contentField = "content"
+internal const val updatedAtField = "updatedAt"
--- a/search/src/main/kotlin/Extractors.kt
+++ b/search/src/main/kotlin/Extractors.kt
@ -0,0 +1,35 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.apache.lucene.document.Document
+import org.apache.lucene.document.Field
+import org.apache.lucene.document.StringField
+import org.apache.lucene.document.TextField
+import org.apache.lucene.search.IndexSearcher
+import org.apache.lucene.search.TopDocs
+
+/**
+ * Builds the Lucene [Document] that represents this note in the index.
+ *
+ * Every field is stored ([Field.Store.YES]) so that it can be read back from a search hit.
+ */
+internal fun PersistedNote.toDocument(): Document {
+    val document = Document()
+
+    // identifier-like fields, added as StringField
+    document.add(StringField(uuidField, UuidFieldConverter.toDoc(uuid), Field.Store.YES))
+    document.add(StringField(updatedAtField, LocalDateTimeFieldConverter.toDoc(updatedAt), Field.Store.YES))
+
+    // full-text fields, added as TextField
+    document.add(TextField(titleField, meta.title, Field.Store.YES))
+    document.add(TextField(tagsField, TagsFieldConverter.toDoc(meta.tags), Field.Store.YES))
+    document.add(TextField(contentField, html, Field.Store.YES))
+
+    return document
+}
+
+/**
+ * Converts every hit of this [TopDocs] into note metadata, loading the stored
+ * fields of each hit through [searcher].
+ */
+internal fun TopDocs.toResults(searcher: IndexSearcher): List<PersistedNoteMetadata> =
+    scoreDocs.map { hit ->
+        val stored = searcher.doc(hit.doc)
+        PersistedNoteMetadata(
+            title = stored.get(titleField),
+            uuid = UuidFieldConverter.fromDoc(stored.get(uuidField)),
+            updatedAt = LocalDateTimeFieldConverter.fromDoc(stored.get(updatedAtField)),
+            tags = TagsFieldConverter.fromDoc(stored.get(tagsField))
+        )
+    }
--- a/search/src/main/kotlin/FieldConverters.kt
+++ b/search/src/main/kotlin/FieldConverters.kt
@ -0,0 +1,26 @@
+package be.simplenotes.search
+
+import java.time.LocalDateTime
+import java.time.format.DateTimeFormatter
+import java.util.*
+
+/**
+ * Two-way conversion between a value of type [T] and the string kept in a document field.
+ */
+internal interface FieldConverter<T> {
+    /** Returns the string form of [value] to put in a document field. */
+    fun toDoc(value: T): String
+
+    /** Rebuilds the typed value from the string [value] read from a document field. */
+    fun fromDoc(value: String): T
+}
+
+/** [FieldConverter] for [LocalDateTime] values, based on [DateTimeFormatter.ISO_DATE_TIME]. */
+internal object LocalDateTimeFieldConverter : FieldConverter<LocalDateTime> {
+    private val formatter = DateTimeFormatter.ISO_DATE_TIME
+
+    override fun toDoc(value: LocalDateTime): String = value.format(formatter)
+
+    override fun fromDoc(value: String): LocalDateTime = formatter.parse(value, LocalDateTime::from)
+}
+
+/** [FieldConverter] for [UUID] values, using their standard string representation. */
+internal object UuidFieldConverter : FieldConverter<UUID> {
+    override fun toDoc(value: UUID): String {
+        return value.toString()
+    }
+
+    override fun fromDoc(value: String): UUID {
+        return UUID.fromString(value)
+    }
+}
+
+/**
+ * [FieldConverter] for a list of tags, kept in the document as one space-separated string.
+ *
+ * A tag that itself contains a space cannot be told apart from two tags once converted.
+ */
+internal object TagsFieldConverter : FieldConverter<List<String>> {
+    private const val SEPARATOR = " "
+
+    override fun toDoc(value: List<String>): String = value.joinToString(SEPARATOR)
+
+    // "".split(" ") yields [""], so empty tokens are dropped: an empty tag list
+    // must read back as an empty list, not as a single empty tag.
+    override fun fromDoc(value: String): List<String> =
+        value.split(SEPARATOR).filter { it.isNotEmpty() }
+}
--- a/search/src/main/kotlin/NoteSearcher.kt
+++ b/search/src/main/kotlin/NoteSearcher.kt
@ -0,0 +1,123 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.apache.lucene.analysis.standard.StandardAnalyzer
+import org.apache.lucene.index.*
+import org.apache.lucene.search.*
+import org.apache.lucene.store.Directory
+import org.apache.lucene.store.FSDirectory
+import org.slf4j.LoggerFactory
+import java.io.File
+import java.io.IOException
+import java.nio.file.FileVisitResult
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.SimpleFileVisitor
+import java.nio.file.attribute.BasicFileAttributes
+import java.util.*
+
+/**
+ * Optional search criteria for the title, tags and content of a note.
+ *
+ * A `null` property means that no criterion is given for that field.
+ */
+data class SearchTerms(val title: String?, val tag: String?, val content: String?)
+
+/**
+ * Lucene-backed note index that keeps one index directory per user.
+ *
+ * Each call opens the Lucene resources it needs and closes them before returning,
+ * including when the operation fails.
+ *
+ * @param basePath directory that holds the per-user index directories, each named after the user id
+ */
+class NoteSearcher(basePath: Path = Path.of("/tmp", "lucene")) {
+    private val baseFile = basePath.toFile()
+
+    private val logger = LoggerFactory.getLogger(javaClass)
+
+    // region utils
+    /** Location of the index that belongs to [userId]. */
+    private fun indexPath(userId: Int): Path = File(baseFile, userId.toString()).toPath()
+
+    /** Opens the index directory of [userId]; the caller is responsible for closing it. */
+    private fun getDirectory(userId: Int): Directory = FSDirectory.open(indexPath(userId))
+
+    /** Term that identifies the document of the note with the given [uuid]. */
+    private fun uuidTerm(uuid: UUID) = Term(uuidField, UuidFieldConverter.toDoc(uuid))
+
+    /**
+     * Runs [action] against a writer on the index of [userId] and commits its changes.
+     *
+     * The writer, the analyzer and the directory are closed afterwards, also when [action]
+     * throws, so that a failed operation does not keep the index locked for later writers.
+     */
+    private fun <T> withWriter(userId: Int, action: (IndexWriter) -> T): T =
+        getDirectory(userId).use { directory ->
+            StandardAnalyzer().use { analyzer ->
+                IndexWriter(directory, IndexWriterConfig(analyzer)).use { writer ->
+                    val result = action(writer)
+                    writer.commit()
+                    result
+                }
+            }
+        }
+    // endregion
+
+    /** Adds [note] to the index of [userId]. */
+    fun indexNote(userId: Int, note: PersistedNote) {
+        logger.debug("Indexing note ${note.uuid} for user $userId")
+        withWriter(userId) { it.addDocument(note.toDocument()) }
+    }
+
+    /** Removes the note identified by [uuid] from the index of [userId]. */
+    fun deleteIndex(userId: Int, uuid: UUID) {
+        logger.debug("Deleting indexing $uuid for user $userId")
+        withWriter(userId) { it.deleteDocuments(TermQuery(uuidTerm(uuid))) }
+    }
+
+    /**
+     * Replaces the indexed version of [note] in the index of [userId].
+     *
+     * The old document is deleted and the new one added through a single writer call
+     * instead of two separately committed operations.
+     */
+    fun updateIndex(userId: Int, note: PersistedNote) {
+        logger.debug("Updating note ${note.uuid} for user $userId")
+        withWriter(userId) { it.updateDocument(uuidTerm(note.uuid), note.toDocument()) }
+    }
+
+    /**
+     * Searches the index of [userId] with a fuzzy query per non-null term; a note matches
+     * when any of them matches.
+     *
+     * @return at most [MAX_RESULTS] matches; an empty list when the user has no index yet
+     */
+    fun search(userId: Int, terms: SearchTerms): List<PersistedNoteMetadata> {
+        val builder = BooleanQuery.Builder()
+
+        // NOTE(review): the terms are not run through the analyzer; StandardAnalyzer lowercases
+        // indexed text, so upper-case input can only match through the fuzzy edit distance - confirm.
+        listOf(
+            titleField to terms.title,
+            tagsField to terms.tag,
+            contentField to terms.content,
+        ).forEach { (field, text) ->
+            if (text != null) {
+                builder.add(BooleanClause(FuzzyQuery(Term(field, text)), BooleanClause.Occur.SHOULD))
+            }
+        }
+
+        val query = builder.build()
+        logger.debug("Searching: $query")
+
+        return getDirectory(userId).use { directory ->
+            if (!DirectoryReader.indexExists(directory)) {
+                // Nothing was ever indexed for this user: opening a reader would throw.
+                emptyList<PersistedNoteMetadata>()
+            } else {
+                DirectoryReader.open(directory).use { reader ->
+                    val searcher = IndexSearcher(reader)
+                    // The hits are converted while the reader is still open.
+                    searcher.search(query, MAX_RESULTS).toResults(searcher)
+                }
+            }
+        }
+    }
+
+    /**
+     * Deletes the whole index directory of [userId].
+     *
+     * This is best effort: a missing index is not an error and a failed deletion is
+     * logged instead of thrown.
+     */
+    fun dropIndex(userId: Int) {
+        val index = indexPath(userId)
+        if (!Files.exists(index)) return
+
+        try {
+            Files.walkFileTree(
+                index,
+                object : SimpleFileVisitor<Path>() {
+                    override fun visitFile(file: Path, attrs: BasicFileAttributes?): FileVisitResult {
+                        Files.delete(file)
+                        return FileVisitResult.CONTINUE
+                    }
+
+                    override fun postVisitDirectory(dir: Path, exc: IOException?): FileVisitResult {
+                        // Surface a failed directory listing instead of a misleading delete failure.
+                        if (exc != null) throw exc
+                        Files.delete(dir)
+                        return FileVisitResult.CONTINUE
+                    }
+                }
+            )
+        } catch (e: IOException) {
+            logger.warn("Could not drop index of user $userId", e)
+        }
+    }
+
+    private companion object {
+        /** Maximum number of hits returned by [search]. */
+        const val MAX_RESULTS = 10
+    }
+}
--- a/search/src/test/kotlin/NoteSearcherTest.kt
+++ b/search/src/test/kotlin/NoteSearcherTest.kt
@ -0,0 +1,162 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.NoteMetadata
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.assertj.core.api.Assertions.assertThat
+import org.intellij.lang.annotations.Language
+import org.junit.jupiter.api.AfterAll
+import org.junit.jupiter.api.BeforeEach
+import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.TestInstance
+import org.junit.jupiter.api.parallel.ResourceLock
+import java.time.LocalDateTime
+import java.util.*
+
+/**
+ * Tests of [NoteSearcher] that index and search notes of user 1 in the default index location.
+ *
+ * The index of user 1 is dropped before each test and once more after the last one.
+ */
+@ResourceLock("lucene")
+@TestInstance(TestInstance.Lifecycle.PER_CLASS) // dropIndexes() is a non-static @AfterAll method
+internal class NoteSearcherTest {
+
+    // region setup
+    private val searcher = NoteSearcher()
+
+    // HTML body shared by the content-based tests
+    @Language("html")
+    private val luceneHtml =
+        """
+        <div>
+          <h1 class="title">Apache Lucene Core</h1>
+          <p>Apache Lucene<span style="vertical-align: super; font-size: xx-small">TM</span> is a
+        high-performance, full-featured text search engine library written entirely in Java.
+        It is a technology suitable for nearly any application that requires full-text search,
+        especially cross-platform.</p>
+        <p>Apache Lucene is an open source project available for free download. Please use the
+        links on the right to access Lucene.</p>
+        <h1 id="lucenetm-features">Lucene<span style="vertical-align: super; font-size: xx-small">TM</span> Features</h1>
+        <p>Lucene offers powerful features through a simple API:</p>
+        <h2 id="scalable-high-performance-indexing">Scalable, High-Performance Indexing</h2>
+        <ul>
+        <li>over <a href="http://home.apache.org/~mikemccand/lucenebench/indexing.html">150GB/hour on modern hardware</a></li>
+        <li>small RAM requirements -- only 1MB heap</li>
+        <li>incremental indexing as fast as batch indexing</li>
+        <li>index size roughly 20-30% the size of text indexed</li>
+        </ul>
+        """.trimIndent()
+
+    /** Indexes a new note for user 1 and returns it. */
+    private fun index(
+        title: String,
+        tags: List<String> = emptyList(),
+        content: String = "",
+        uuid: UUID = UUID.randomUUID(),
+    ): PersistedNote {
+        val note = PersistedNote(NoteMetadata(title, tags), markdown = "", content, LocalDateTime.now(), uuid)
+        searcher.indexNote(1, note)
+        return note
+    }
+
+    /** Searches the notes of user 1. */
+    private fun search(
+        title: String? = null,
+        tag: String? = null,
+        content: String? = null,
+    ): List<PersistedNoteMetadata> = searcher.search(1, SearchTerms(title, tag, content))
+
+    @BeforeEach
+    @AfterAll
+    fun dropIndexes() {
+        searcher.dropIndex(1)
+    }
+    // endregion
+
+    @Test
+    fun `exact title search`() {
+        index("first")
+        index("second")
+        index("flip")
+
+        assertThat(search("first"))
+            .hasSizeGreaterThanOrEqualTo(1)
+            .anyMatch { it.title == "first" }
+
+        assertThat(search("nothing")).isEmpty()
+    }
+
+    @Test
+    fun `fuzzy title search`() {
+        index("first")
+        index("second")
+        index("flip")
+
+        assertThat(search("firt"))
+            .hasSizeGreaterThanOrEqualTo(1)
+            .anyMatch { it.title == "first" }
+
+        assertThat(search("nothing")).isEmpty()
+    }
+
+    @Test
+    fun `exact tags search`() {
+        index("first", tags = listOf("example", "flamingo"))
+        index("second", tags = listOf("yes"))
+        index("second")
+
+        assertThat(search(tag = "example"))
+            .hasSize(1)
+            .anyMatch { it.title == "first" }
+    }
+
+    @Test
+    fun `exact content search`() {
+        index("first", content = luceneHtml)
+
+        assertThat(search(content = "fast"))
+            .hasSize(1)
+            .anyMatch { it.title == "first" }
+
+        @Suppress("SpellCheckingInspection")
+        assertThat(search(content = "preformance")) // <- note the error
+            .hasSize(1)
+            .anyMatch { it.title == "first" }
+    }
+
+    @Test
+    fun `combined search`() {
+        index("first", content = luceneHtml, tags = listOf("abc"))
+
+        assertThat(search(title = "fir", tag = "abc", content = "20"))
+            .hasSize(1)
+    }
+
+    @Test
+    fun `delete index`() {
+        val uuid = index("first").uuid
+        searcher.deleteIndex(1, uuid)
+        assertThat(search("first")).isEmpty()
+    }
+
+    @Test
+    fun `update index`() {
+        val note = index("first")
+        searcher.updateIndex(1, note.copy(meta = note.meta.copy(title = "new")))
+        assertThat(search("first")).isEmpty()
+        assertThat(search("new")).hasSize(1)
+    }
+}