diff --git a/pom.xml b/pom.xml
index 137781c..53b4e21 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,6 +10,7 @@ app domain shared + search pom
diff --git a/search/pom.xml b/search/pom.xml
new file mode 100644
index 0000000..4fb947f
--- /dev/null
+++ b/search/pom.xml
@@ -0,0 +1,50 @@ + + + + parent + be.simplenotes + 1.0-SNAPSHOT + + 4.0.0 + + search + + + 8.5.2 + + + + + be.simplenotes + domain + 1.0-SNAPSHOT + + + org.apache.lucene + lucene-core + ${lucene.version} + + + org.apache.lucene + lucene-analyzers-common + ${lucene.version} + + + org.apache.lucene + lucene-queryparser + ${lucene.version} + + + + be.simplenotes + shared + 1.0-SNAPSHOT + test-jar + test + + + + +
diff --git a/search/src/main/kotlin/Constants.kt b/search/src/main/kotlin/Constants.kt
new file mode 100644
index 0000000..c81f32a
--- /dev/null
+++ b/search/src/main/kotlin/Constants.kt
@@ -0,0 +1,8 @@
+package be.simplenotes.search
+
+// Names of the Lucene document fields, shared by indexing, querying and result extraction.
+internal const val uuidField = "uuid"
+internal const val titleField = "title"
+internal const val tagsField = "tags"
+internal const val contentField = "content"
+internal const val updatedAtField = "updatedAt"
diff --git a/search/src/main/kotlin/Extractors.kt b/search/src/main/kotlin/Extractors.kt
new file mode 100644
index 0000000..ae1db61
--- /dev/null
+++ b/search/src/main/kotlin/Extractors.kt
@@ -0,0 +1,37 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.apache.lucene.document.Document
+import org.apache.lucene.document.Field
+import org.apache.lucene.document.StringField
+import org.apache.lucene.document.TextField
+import org.apache.lucene.search.IndexSearcher
+import org.apache.lucene.search.TopDocs
+
+/** Maps this note to the Lucene document that is written to the index. */
+internal fun PersistedNote.toDocument(): Document {
+    val note = this
+    return Document().apply {
+        // exact-match fields: a StringField is indexed as one un-analyzed token
+        add(StringField(uuidField, UuidFieldConverter.toDoc(note.uuid), Field.Store.YES))
+        add(StringField(updatedAtField, LocalDateTimeFieldConverter.toDoc(note.updatedAt), Field.Store.YES))
+
+        // full-text fields: a TextField is run through the analyzer
+        add(TextField(titleField, note.meta.title, Field.Store.YES))
+        add(TextField(tagsField, TagsFieldConverter.toDoc(note.meta.tags), Field.Store.YES))
+        add(TextField(contentField, note.html, Field.Store.YES))
+    }
+}
+
+/** Loads the stored fields of every hit and rebuilds the note metadata from them. */
+internal fun TopDocs.toResults(searcher: IndexSearcher): List<PersistedNoteMetadata> =
+    scoreDocs.map { hit ->
+        val doc = searcher.doc(hit.doc)
+        PersistedNoteMetadata(
+            title = doc.get(titleField),
+            uuid = UuidFieldConverter.fromDoc(doc.get(uuidField)),
+            updatedAt = LocalDateTimeFieldConverter.fromDoc(doc.get(updatedAtField)),
+            tags = TagsFieldConverter.fromDoc(doc.get(tagsField))
+        )
+    }
diff --git a/search/src/main/kotlin/FieldConverters.kt b/search/src/main/kotlin/FieldConverters.kt
new file mode 100644
index 0000000..34e3afa
--- /dev/null
+++ b/search/src/main/kotlin/FieldConverters.kt
@@ -0,0 +1,30 @@
+package be.simplenotes.search
+
+import java.time.LocalDateTime
+import java.time.format.DateTimeFormatter
+import java.util.*
+
+/** Two-way mapping between a value of type [T] and the string stored in a document field. */
+internal interface FieldConverter<T> {
+    fun toDoc(value: T): String
+    fun fromDoc(value: String): T
+}
+
+internal object LocalDateTimeFieldConverter : FieldConverter<LocalDateTime> {
+    private val formatter = DateTimeFormatter.ISO_DATE_TIME
+    override fun toDoc(value: LocalDateTime): String = formatter.format(value)
+    override fun fromDoc(value: String): LocalDateTime = LocalDateTime.parse(value, formatter)
+}
+
+internal object UuidFieldConverter : FieldConverter<UUID> {
+    override fun toDoc(value: UUID): String = value.toString()
+    override fun fromDoc(value: String): UUID = UUID.fromString(value)
+}
+
+/** Stores tags as one space-separated string, so a tag must not itself contain a space. */
+internal object TagsFieldConverter : FieldConverter<List<String>> {
+    override fun toDoc(value: List<String>): String = value.joinToString(" ")
+
+    // "".split(" ") yields [""], so blanks are dropped to map a tag-less note back to an empty list
+    override fun fromDoc(value: String): List<String> = value.split(" ").filter { it.isNotEmpty() }
+}
diff --git a/search/src/main/kotlin/NoteSearcher.kt b/search/src/main/kotlin/NoteSearcher.kt new file mode 100644 index 0000000..5c6126a --- /dev/null +++ 
b/search/src/main/kotlin/NoteSearcher.kt
@@ -0,0 +1,138 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.apache.lucene.analysis.standard.StandardAnalyzer
+import org.apache.lucene.index.*
+import org.apache.lucene.search.*
+import org.apache.lucene.store.Directory
+import org.apache.lucene.store.FSDirectory
+import org.slf4j.LoggerFactory
+import java.io.File
+import java.io.IOException
+import java.nio.file.FileVisitResult
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.SimpleFileVisitor
+import java.nio.file.attribute.BasicFileAttributes
+import java.util.*
+
+/** Optional per-field search input; a field left `null` adds no clause to the query. */
+data class SearchTerms(val title: String?, val tag: String?, val content: String?)
+
+/**
+ * Full-text note index backed by one Lucene directory per user, located under `basePath`.
+ *
+ * Each call opens the Lucene resources it needs and closes them before returning, so neither the
+ * write lock nor index file handles outlive the call.
+ */
+class NoteSearcher(basePath: Path = Path.of("/tmp", "lucene")) {
+    private val baseFile = basePath.toFile()
+
+    private val logger = LoggerFactory.getLogger(javaClass)
+
+    // An analyzer can be shared, so one instance serves every writer.
+    private val analyzer = StandardAnalyzer()
+
+    // region utils
+    private fun indexPath(userId: Int): Path = File(baseFile, userId.toString()).toPath()
+
+    private fun getDirectory(userId: Int): Directory = FSDirectory.open(indexPath(userId))
+
+    /** Runs [block] with a writer on the user's index, closing writer and directory afterwards. */
+    private inline fun <T> withWriter(userId: Int, block: (IndexWriter) -> T): T =
+        getDirectory(userId).use { dir ->
+            // `use` closes the writer (releasing its write lock) even when block throws
+            IndexWriter(dir, IndexWriterConfig(analyzer)).use(block)
+        }
+
+    private fun uuidTerm(uuid: UUID) = Term(uuidField, UuidFieldConverter.toDoc(uuid))
+
+    /** Adds an optional fuzzy clause on [field]; does nothing when [text] is `null`. */
+    private fun BooleanQuery.Builder.addFuzzy(field: String, text: String?) {
+        if (text == null) return
+        // StandardAnalyzer lower-cases indexed text while FuzzyQuery terms are used as given,
+        // so the term is lower-cased here to keep mixed-case input within edit distance.
+        val term = Term(field, text.toLowerCase(Locale.ROOT))
+        add(BooleanClause(FuzzyQuery(term), BooleanClause.Occur.SHOULD))
+    }
+    // endregion
+
+    /** Adds [note] to the index of [userId] and commits. */
+    fun indexNote(userId: Int, note: PersistedNote) {
+        logger.debug("Indexing note ${note.uuid} for user $userId")
+        withWriter(userId) { writer ->
+            writer.addDocument(note.toDocument())
+            writer.commit()
+        }
+    }
+
+    /** Removes every document whose uuid field equals [uuid] from the index of [userId]. */
+    fun deleteIndex(userId: Int, uuid: UUID) {
+        logger.debug("Deleting indexing $uuid for user $userId")
+        withWriter(userId) { writer ->
+            writer.deleteDocuments(uuidTerm(uuid))
+            writer.commit()
+        }
+    }
+
+    /** Replaces the indexed document of [note] in one step (delete by uuid, then add). */
+    fun updateIndex(userId: Int, note: PersistedNote) {
+        logger.debug("Updating note ${note.uuid} for user $userId")
+        withWriter(userId) { writer ->
+            // updateDocument is atomic, unlike a separate delete followed by an add
+            writer.updateDocument(uuidTerm(note.uuid), note.toDocument())
+            writer.commit()
+        }
+    }
+
+    /**
+     * Returns up to 10 notes of [userId] that fuzzily match any of the given [terms].
+     *
+     * Yields an empty list when the user has no index yet.
+     */
+    fun search(userId: Int, terms: SearchTerms): List<PersistedNoteMetadata> {
+        val query = BooleanQuery.Builder().apply {
+            addFuzzy(titleField, terms.title)
+            addFuzzy(tagsField, terms.tag)
+            addFuzzy(contentField, terms.content)
+        }.build()
+        logger.debug("Searching: $query")
+
+        return getDirectory(userId).use { dir ->
+            // Opening a reader on a directory without an index throws, so check first.
+            if (!DirectoryReader.indexExists(dir)) return@use emptyList<PersistedNoteMetadata>()
+            DirectoryReader.open(dir).use { reader ->
+                val searcher = IndexSearcher(reader)
+                // toResults builds its list eagerly, so nothing reads the reader after it closes
+                searcher.search(query, 10).toResults(searcher)
+            }
+        }
+    }
+
+    /** Deletes the whole index directory of [userId]; failures are logged, not thrown. */
+    fun dropIndex(userId: Int) {
+        val index = indexPath(userId)
+        // Nothing to delete for a user whose index was never created.
+        if (Files.notExists(index)) return
+        try {
+            Files.walkFileTree(
+                index,
+                object : SimpleFileVisitor<Path>() {
+                    override fun visitFile(file: Path, attrs: BasicFileAttributes?): FileVisitResult {
+                        Files.delete(file)
+                        return FileVisitResult.CONTINUE
+                    }
+
+                    override fun postVisitDirectory(dir: Path, exc: IOException?): FileVisitResult {
+                        Files.delete(dir)
+                        return FileVisitResult.CONTINUE
+                    }
+                }
+            )
+        } catch (e: IOException) {
+            // Dropping stays best-effort, but a half-deleted index should not go unnoticed.
+            logger.warn("Could not fully delete the index of user $userId", e)
+        }
+    }
+}
diff --git a/search/src/test/kotlin/NoteSearcherTest.kt b/search/src/test/kotlin/NoteSearcherTest.kt
new file mode 100644
index 0000000..357f1d5
--- /dev/null
+++ b/search/src/test/kotlin/NoteSearcherTest.kt
@@ -0,0 +1,162 @@
+package be.simplenotes.search
+
+import be.simplenotes.domain.model.NoteMetadata
+import be.simplenotes.domain.model.PersistedNote
+import be.simplenotes.domain.model.PersistedNoteMetadata
+import org.assertj.core.api.Assertions.assertThat
+import 
org.intellij.lang.annotations.Language +import org.junit.jupiter.api.AfterAll +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.parallel.ResourceLock +import java.time.LocalDateTime +import java.util.* + +@ResourceLock("lucene") /* Every test below indexes into and searches user 1 of a NoteSearcher created with no arguments. */ +internal class NoteSearcherTest { + + // region setup + private val searcher = NoteSearcher() + + private fun index( /* Builds a note from the arguments, indexes it for user 1 and returns it. */ + title: String, + tags: List = emptyList(), + content: String = "", + uuid: UUID = UUID.randomUUID(), + ): PersistedNote { + val note = PersistedNote(NoteMetadata(title, tags), markdown = "", content, LocalDateTime.now(), uuid) + searcher.indexNote(1, note) + return note + } + + private fun search( /* Searches user 1's index with whichever terms are given. */ + title: String? = null, + tag: String? = null, + content: String? = null, + ): List = searcher.search(1, SearchTerms(title, tag, content)) + + @BeforeEach + @AfterAll /* NOTE(review): JUnit 5 only accepts a non-static @AfterAll method under the PER_CLASS test-instance lifecycle - confirm that is configured outside this file. */ + fun dropIndexes() { + searcher.dropIndex(1) + } + // endregion + + @Test + fun `exact title search`() { + index("first") + index("second") + index("flip") + + assertThat(search("first")) + .hasSizeGreaterThanOrEqualTo(1) + .anyMatch { it.title == "first" } + + assertThat(search("nothing")).isEmpty() + } + + @Test + fun `fuzzy title search`() { + index("first") + index("second") + index("flip") + + assertThat(search("firt")) + .hasSizeGreaterThanOrEqualTo(1) + .anyMatch { it.title == "first" } + + assertThat(search("nothing")).isEmpty() + } + + @Test + fun `exact tags search`() { + index("first", tags = listOf("example", "flamingo")) + index("second", tags = listOf("yes")) + index("second") + + assertThat(search(tag = "example")) + .hasSize(1) + .anyMatch { it.title == "first" } + } + + @Test + fun `exact content search`() { + @Language("html") + val content = + """ +
+

Apache Lucene Core

+

Apache LuceneTM is a + high-performance, full-featured text search engine library written entirely in Java. + It is a technology suitable for nearly any application that requires full-text search, + especially cross-platform.

+

Apache Lucene is an open source project available for free download. Please use the + links on the right to access Lucene.

+

LuceneTM Features

+

Lucene offers powerful features through a simple API:

+

Scalable, High-Performance Indexing

+
    +
  • over 150GB/hour on modern hardware
  • +
  • small RAM requirements -- only 1MB heap
  • +
  • incremental indexing as fast as batch indexing
  • +
  • index size roughly 20-30% the size of text indexed
  • +
+ """.trimIndent() + + index("first", content = content) + + assertThat(search(content = "fast")) + .hasSize(1) + .anyMatch { it.title == "first" } + + @Suppress("SpellCheckingInspection") + assertThat(search(content = "preformance")) // <- note the error + .hasSize(1) + .anyMatch { it.title == "first" } + } + + @Test + fun `combined search`() { + @Language("html") + val content = + """ +
+

Apache Lucene Core

+

Apache LuceneTM is a + high-performance, full-featured text search engine library written entirely in Java. + It is a technology suitable for nearly any application that requires full-text search, + especially cross-platform.

+

Apache Lucene is an open source project available for free download. Please use the + links on the right to access Lucene.

+

LuceneTM Features

+

Lucene offers powerful features through a simple API:

+

Scalable, High-Performance Indexing

+
    +
  • over 150GB/hour on modern hardware
  • +
  • small RAM requirements -- only 1MB heap
  • +
  • incremental indexing as fast as batch indexing
  • +
  • index size roughly 20-30% the size of text indexed
  • +
+ """.trimIndent() + + index("first", content = content, tags = listOf("abc")) + + assertThat(search(title = "fir", tag = "abc", content = "20")) + .hasSize(1) + } + + @Test + fun `delete index`() { + val uuid = index("first").uuid + searcher.deleteIndex(1, uuid) + assertThat(search("first")).isEmpty() + } + + @Test + fun `update index`() { + val note = index("first") + searcher.updateIndex(1, note.copy(meta = note.meta.copy(title = "new"))) + assertThat(search("first")).isEmpty() + assertThat(search("new")).hasSize(1) + } +}