Compare commits

..

2 Commits

Author SHA1 Message Date
lm41
66491c27fc Make emoji history scrollable 2025-07-26 13:46:31 +02:00
Lars Mühlbauer
c55a87862f Remove enforcement for navigationBarContrast (#2987)
* Remove enforcement for navigationBarContrast

* Use luminance for color contrast detection

* Always use light icons when background image is present
2025-07-22 01:10:29 +02:00
27 changed files with 173 additions and 4099 deletions

View File

@@ -25,7 +25,6 @@ import androidx.compose.foundation.gestures.detectTapGestures
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.ExperimentalLayoutApi
import androidx.compose.foundation.layout.FlowRow
import androidx.compose.foundation.layout.aspectRatio
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.fillMaxWidth
@@ -39,6 +38,8 @@ import androidx.compose.foundation.lazy.grid.GridCells
import androidx.compose.foundation.lazy.grid.LazyVerticalGrid
import androidx.compose.foundation.lazy.grid.items
import androidx.compose.foundation.lazy.grid.rememberLazyGridState
import androidx.compose.foundation.pager.HorizontalPager
import androidx.compose.foundation.pager.rememberPagerState
import androidx.compose.foundation.shape.GenericShape
import androidx.compose.material.icons.Icons
import androidx.compose.material.icons.automirrored.filled.KeyboardArrowLeft
@@ -46,8 +47,6 @@ import androidx.compose.material.icons.automirrored.filled.KeyboardArrowRight
import androidx.compose.material.icons.outlined.Delete
import androidx.compose.material.icons.outlined.PushPin
import androidx.compose.material3.ButtonDefaults
import androidx.compose.material3.Icon
import androidx.compose.material3.LocalContentColor
import androidx.compose.material3.Tab
import androidx.compose.material3.TabRow
import androidx.compose.material3.TabRowDefaults
@@ -55,6 +54,7 @@ import androidx.compose.material3.TabRowDefaults.tabIndicatorOffset
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.CompositionLocalProvider
import androidx.compose.runtime.LaunchedEffect
import androidx.compose.runtime.collectAsState
import androidx.compose.runtime.getValue
import androidx.compose.runtime.key
@@ -63,6 +63,7 @@ import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.rememberCoroutineScope
import androidx.compose.runtime.setValue
import androidx.compose.runtime.snapshotFlow
import androidx.compose.ui.Alignment
import androidx.compose.ui.Modifier
import androidx.compose.ui.graphics.Color
@@ -172,7 +173,6 @@ fun EmojiPaletteView(
}
}
var recentlyUsedVersion by remember { mutableIntStateOf(0) }
val lazyListState = rememberLazyGridState()
val scope = rememberCoroutineScope()
@Composable
@@ -207,22 +207,110 @@ fun EmojiPaletteView(
)
}
Column(modifier = modifier) {
fun calculatePageNumbers(): Int {
return when {
!emojiHistoryEnabled -> EmojiCategoryValues.size - 1
else -> EmojiCategoryValues.size
}
}
fun pageNumberToCategory(pageNumber: Int): EmojiCategory {
return when {
!emojiHistoryEnabled -> EmojiCategoryValues[pageNumber+1]
else -> EmojiCategoryValues[pageNumber]
}
}
fun categoryToPageNumber(category: EmojiCategory): Int {
return if (emojiHistoryEnabled) {
EmojiCategoryValues.indexOf(category)
} else {
EmojiCategoryValues.indexOf(category) - 1
}
}
@Composable
fun EmojiCategoriesTabRow(
activeCategory: EmojiCategory,
onCategoryChange: (EmojiCategory) -> Unit,
) {
val inputFeedbackController = LocalInputFeedbackController.current
val selectedTabIndex = categoryToPageNumber(activeCategory)
val style = rememberSnyggThemeQuery(FlorisImeUi.MediaEmojiTab.elementName)
TabRow(
modifier = Modifier
.fillMaxWidth()
.height(FlorisImeSizing.smartbarHeight),
selectedTabIndex = selectedTabIndex,
containerColor = Color.Transparent,
contentColor = style.foreground(),
indicator = { tabPositions ->
val style = rememberSnyggThemeQuery(
elementName = FlorisImeUi.MediaEmojiTab.elementName,
selector = SnyggSelector.FOCUS,
)
TabRowDefaults.PrimaryIndicator(
Modifier.tabIndicatorOffset(tabPositions[selectedTabIndex]),
height = 4.dp,
color = style.foreground(),
)
},
) {
for (category in EmojiCategoryValues) {
if (category == EmojiCategory.RECENTLY_USED && !emojiHistoryEnabled) {
continue
}
Tab(
onClick = {
inputFeedbackController.keyPress(TextKeyData.UNSPECIFIED)
onCategoryChange(category)
},
selected = activeCategory == category,
icon = { SnyggIcon(
elementName = FlorisImeUi.MediaEmojiTab.elementName,
selector = if (activeCategory == category) SnyggSelector.FOCUS else SnyggSelector.NONE,
modifier = Modifier.size(ButtonDefaults.IconSize),
imageVector = category.icon(),
) },
)
}
}
}
Column(
modifier = modifier
) {
val pagerState = rememberPagerState(
pageCount = { calculatePageNumbers() }
)
// Reset the pager to the first page when emojiHistory is enabled
LaunchedEffect(emojiHistoryEnabled) {
pagerState.animateScrollToPage(0)
}
EmojiCategoriesTabRow(
activeCategory = activeCategory,
onCategoryChange = { category ->
scope.launch { lazyListState.scrollToItem(0) }
activeCategory = category
scope.launch { pagerState.animateScrollToPage(categoryToPageNumber(activeCategory)) }
},
emojiHistoryEnabled = emojiHistoryEnabled,
)
HorizontalPager(pagerState, beyondViewportPageCount = 1) { page ->
// Every page needs its own lazyGridState in order to scroll correctly
val lazyGridState = rememberLazyGridState()
Box(
modifier = Modifier
.fillMaxWidth()
.weight(1f),
) {
val emojiMapping = if (activeCategory == EmojiCategory.RECENTLY_USED) {
// Update the lazyGridState and active category on scroll
LaunchedEffect(pagerState) {
snapshotFlow { pagerState.currentPage }.collect { page ->
lazyGridState.scrollToItem(0)
activeCategory = pageNumberToCategory(page)
}
}
val category = pageNumberToCategory(page)
val emojiMapping = if (category == EmojiCategory.RECENTLY_USED) {
// Purposely using remember here to prevent recomposition, as this would cause rapid
// emoji changes for the user when in recently used category.
remember(recentlyUsedVersion) {
@@ -237,63 +325,68 @@ fun EmojiPaletteView(
EmojiMappingForView(
pinned = emptyList(),
recent = emptyList(),
simple = emojiMappings[activeCategory]!!,
simple = emojiMappings[category]!!,
)
}
val isEmojiHistoryEmpty = emojiMapping.pinned.isEmpty() && emojiMapping.recent.isEmpty()
if (activeCategory == EmojiCategory.RECENTLY_USED && deviceLocked) {
Column(
modifier = Modifier
.fillMaxSize()
.padding(all = 8.dp),
) {
Text(
text = stringRes(R.string.emoji__history__phone_locked_message),
)
}
} else if (activeCategory == EmojiCategory.RECENTLY_USED && isEmojiHistoryEmpty) {
Column(
modifier = Modifier
.fillMaxSize()
.padding(all = 8.dp),
) {
Text(
text = stringRes(R.string.emoji__history__empty_message),
)
Text(
modifier = Modifier.padding(top = 8.dp),
text = stringRes(R.string.emoji__history__usage_tip),
fontStyle = FontStyle.Italic,
)
}
} else key(emojiMapping) {
CompositionLocalProvider(LocalLayoutDirection provides LayoutDirection.Ltr) {
LazyVerticalGrid(
when (category) {
EmojiCategory.RECENTLY_USED if deviceLocked -> {
Column(
modifier = Modifier
.fillMaxSize()
.florisScrollbar(lazyListState),
columns = GridCells.Adaptive(minSize = EmojiBaseWidth),
state = lazyListState,
.padding(all = 8.dp),
) {
if (emojiMapping.pinned.isNotEmpty()) {
header("header_pinned") {
GridHeader(text = stringRes(R.string.emoji__history__pinned))
Text(
text = stringRes(R.string.emoji__history__phone_locked_message),
)
}
}
EmojiCategory.RECENTLY_USED if isEmojiHistoryEmpty -> {
Column(
modifier = Modifier
.fillMaxSize()
.padding(all = 8.dp),
) {
Text(
text = stringRes(R.string.emoji__history__empty_message),
)
Text(
modifier = Modifier.padding(top = 8.dp),
text = stringRes(R.string.emoji__history__usage_tip),
fontStyle = FontStyle.Italic,
)
}
}
else -> key(emojiMapping) {
CompositionLocalProvider(LocalLayoutDirection provides LayoutDirection.Ltr) {
LazyVerticalGrid(
modifier = Modifier
.fillMaxSize()
.florisScrollbar(lazyGridState),
columns = GridCells.Adaptive(minSize = EmojiBaseWidth),
state = lazyGridState,
) {
if (emojiMapping.pinned.isNotEmpty()) {
header("header_pinned") {
GridHeader(text = stringRes(R.string.emoji__history__pinned))
}
items(emojiMapping.pinned) { emojiSet ->
EmojiKeyWrapper(emojiSet, isPinned = true)
}
}
items(emojiMapping.pinned) { emojiSet ->
EmojiKeyWrapper(emojiSet, isPinned = true)
if (emojiMapping.recent.isNotEmpty()) {
header("header_recent") {
GridHeader(text = stringRes(R.string.emoji__history__recent))
}
items(emojiMapping.recent) { emojiSet ->
EmojiKeyWrapper(emojiSet, isRecent = true)
}
}
}
if (emojiMapping.recent.isNotEmpty()) {
header("header_recent") {
GridHeader(text = stringRes(R.string.emoji__history__recent))
}
items(emojiMapping.recent) { emojiSet ->
EmojiKeyWrapper(emojiSet, isRecent = true)
}
}
if (emojiMapping.simple.isNotEmpty()) {
items(emojiMapping.simple) { emojiSet ->
EmojiKeyWrapper(emojiSet)
if (emojiMapping.simple.isNotEmpty()) {
items(emojiMapping.simple) { emojiSet ->
EmojiKeyWrapper(emojiSet)
}
}
}
}
@@ -303,59 +396,6 @@ fun EmojiPaletteView(
}
}
@Composable
private fun EmojiCategoriesTabRow(
activeCategory: EmojiCategory,
onCategoryChange: (EmojiCategory) -> Unit,
emojiHistoryEnabled: Boolean,
) {
val inputFeedbackController = LocalInputFeedbackController.current
val selectedTabIndex = if (emojiHistoryEnabled) {
EmojiCategoryValues.indexOf(activeCategory)
} else {
EmojiCategoryValues.indexOf(activeCategory) - 1
}
val style = rememberSnyggThemeQuery(FlorisImeUi.MediaEmojiTab.elementName)
TabRow(
modifier = Modifier
.fillMaxWidth()
.height(FlorisImeSizing.smartbarHeight),
selectedTabIndex = selectedTabIndex,
containerColor = Color.Transparent,
contentColor = style.foreground(),
indicator = { tabPositions ->
val style = rememberSnyggThemeQuery(
elementName = FlorisImeUi.MediaEmojiTab.elementName,
selector = SnyggSelector.FOCUS,
)
TabRowDefaults.PrimaryIndicator(
Modifier.tabIndicatorOffset(tabPositions[selectedTabIndex]),
height = 4.dp,
color = style.foreground(),
)
},
) {
for (category in EmojiCategoryValues) {
if (category == EmojiCategory.RECENTLY_USED && !emojiHistoryEnabled) {
continue
}
Tab(
onClick = {
inputFeedbackController.keyPress(TextKeyData.UNSPECIFIED)
onCategoryChange(category)
},
selected = activeCategory == category,
icon = { SnyggIcon(
elementName = FlorisImeUi.MediaEmojiTab.elementName,
selector = if (activeCategory == category) SnyggSelector.FOCUS else SnyggSelector.NONE,
modifier = Modifier.size(ButtonDefaults.IconSize),
imageVector = category.icon(),
) },
)
}
}
}
@Composable
private fun EmojiKey(
emojiSet: EmojiSet,

View File

@@ -23,22 +23,35 @@ import android.inputmethodservice.InputMethodService
import android.view.Window
import androidx.compose.runtime.Composable
import androidx.compose.runtime.LaunchedEffect
import androidx.compose.ui.graphics.luminance
import androidx.compose.ui.platform.LocalView
import androidx.core.view.WindowInsetsControllerCompat
import dev.patrickgold.florisboard.ime.theme.FlorisImeTheme
import dev.patrickgold.florisboard.ime.theme.FlorisImeUi
import org.florisboard.lib.android.AndroidVersion
import org.florisboard.lib.snygg.ui.rememberSnyggThemeQuery
import org.florisboard.lib.snygg.ui.uriOrNull
@Composable
fun SystemUiIme() {
val useDarkIcons = !FlorisImeTheme.config.isNightTheme
val backgroundQuery = rememberSnyggThemeQuery(FlorisImeUi.Window.elementName)
val backgroundColor = backgroundQuery.background()
val backgroundImage = backgroundQuery.backgroundImage.uriOrNull()
val hasBackgroundImage = backgroundImage != null
val useDarkIcons = if (backgroundImage == null) {
backgroundColor.luminance() >= 0.5
} else {
false
}
val view = LocalView.current
val window = view.context.findWindow()!!
val windowInsetsController = WindowInsetsControllerCompat(window, view)
LaunchedEffect(useDarkIcons) {
LaunchedEffect(useDarkIcons, hasBackgroundImage) {
windowInsetsController.isAppearanceLightNavigationBars = useDarkIcons
if (AndroidVersion.ATLEAST_API29_Q) {
window.isNavigationBarContrastEnforced = true
window.isNavigationBarContrastEnforced = hasBackgroundImage
}
}
}

View File

@@ -48,7 +48,6 @@ import androidx.compose.ui.unit.takeOrElse
import kotlinx.coroutines.runBlocking
import org.florisboard.lib.color.ColorMappings
import org.florisboard.lib.snygg.CompiledFontFamilyData
import org.florisboard.lib.snygg.SnyggAttributes
import org.florisboard.lib.snygg.SnyggQueryAttributes
import org.florisboard.lib.snygg.SnyggRule
import org.florisboard.lib.snygg.SnyggSelector
@@ -364,7 +363,7 @@ internal fun SnyggValue.dpSize(default: Dp = Dp.Unspecified): Dp {
}
}
internal fun SnyggValue.uriOrNull(): String? {
fun SnyggValue.uriOrNull(): String? {
return when (this) {
is SnyggUriValue -> this.uri
else -> null

View File

@@ -1,828 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "calendrical_calculations"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cec493b209a1b81fa32312d7ceca1b547d341c7b5f16a3edbf32b1d8b455bbdf"
dependencies = [
"core_maths",
"displaydoc",
]
[[package]]
name = "core_maths"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3"
dependencies = [
"libm",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "fixed_decimal"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0febbeb1118a9ecdee6e4520ead6b54882e843dd0592ad233247dbee84c53db8"
dependencies = [
"displaydoc",
"smallvec",
"writeable",
]
[[package]]
name = "flest"
version = "0.1.0"
dependencies = [
"textutils",
]
[[package]]
name = "icu"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff5e3018d703f168b00dcefa540a65f1bbc50754ae32f3f5f0e43fe5ee51502"
dependencies = [
"icu_calendar",
"icu_casemap",
"icu_collator",
"icu_collections",
"icu_datetime",
"icu_decimal",
"icu_experimental",
"icu_list",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_plurals",
"icu_properties",
"icu_provider",
"icu_segmenter",
"icu_timezone",
]
[[package]]
name = "icu_calendar"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7265b2137f9a36f7634a308d91f984574bbdba8cfd95ceffe1c345552275a8ff"
dependencies = [
"calendrical_calculations",
"displaydoc",
"icu_calendar_data",
"icu_locid",
"icu_locid_transform",
"icu_provider",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_calendar_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e009b7f0151ee6fb28c40b1283594397e0b7183820793e9ace3dcd13db126d0"
[[package]]
name = "icu_casemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff0c8ae9f8d31b12e27fc385ff9ab1f3cd9b17417c665c49e4ec958c37da75f"
dependencies = [
"displaydoc",
"icu_casemap_data",
"icu_collections",
"icu_locid",
"icu_properties",
"icu_provider",
"writeable",
"zerovec",
]
[[package]]
name = "icu_casemap_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d57966d5ab748f74513be4046867f9a20e801e2775d41f91d04a0f560b61f08"
[[package]]
name = "icu_collator"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d370371887d31d56f361c3eaa15743e54f13bc677059c9191c77e099ed6966b2"
dependencies = [
"displaydoc",
"icu_collator_data",
"icu_collections",
"icu_locid_transform",
"icu_normalizer",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_collator_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ee3f88741364b7d6269cce6827a3e6a8a2cf408a78f766c9224ab479d5e4ae5"
[[package]]
name = "icu_collections"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_datetime"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d115efb85e08df3fd77e77f52e7e087545a783fffba8be80bfa2102f306b1780"
dependencies = [
"displaydoc",
"either",
"fixed_decimal",
"icu_calendar",
"icu_datetime_data",
"icu_decimal",
"icu_locid",
"icu_locid_transform",
"icu_plurals",
"icu_provider",
"icu_timezone",
"smallvec",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_datetime_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ba7e7f7a01269b9afb0a39eff4f8676f693b55f509b3120e43a0350a9f88bea"
[[package]]
name = "icu_decimal"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb8fd98f86ec0448d85e1edf8884e4e318bb2e121bd733ec929a05c0a5e8b0eb"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_decimal_data",
"icu_locid_transform",
"icu_provider",
"writeable",
]
[[package]]
name = "icu_decimal_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d424c994071c6f5644f999925fc868c85fec82295326e75ad5017bc94b41523"
[[package]]
name = "icu_experimental"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "844ad7b682a165c758065d694bc4d74ac67f176da1c499a04d85d492c0f193b7"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_collections",
"icu_decimal",
"icu_experimental_data",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_pattern",
"icu_plurals",
"icu_properties",
"icu_provider",
"litemap",
"num-bigint",
"num-rational",
"num-traits",
"smallvec",
"tinystr",
"writeable",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_experimental_data"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c178b9a34083fca5bd70d61f647575335e9c197d0f30c38e8ccd187babc69d0"
[[package]]
name = "icu_list"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbfeda1d7775b6548edd4e8b7562304a559a91ed56ab56e18961a053f367c365"
dependencies = [
"displaydoc",
"icu_list_data",
"icu_locid_transform",
"icu_provider",
"regex-automata 0.2.0",
"writeable",
]
[[package]]
name = "icu_list_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1825170d2c6679cb20dbd96a589d034e49f698aed9a2ef4fafc9a0101ed298f"
[[package]]
name = "icu_locid"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_pattern"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7f36aafd098d6717de34e668a8120822275c1fba22b936e757b7de8a2fd7e4"
dependencies = [
"displaydoc",
"either",
"writeable",
"yoke",
"zerofrom",
]
[[package]]
name = "icu_plurals"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba5a70e7c025dbd5c501b0a5c188cd11666a424f0dadcd4f0a95b7dafde3b114"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_locid_transform",
"icu_plurals_data",
"icu_provider",
"zerovec",
]
[[package]]
name = "icu_plurals_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3e8f775b215d45838814a090a2227247a7431d74e9156407d9c37f6ef0f208"
[[package]]
name = "icu_properties"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "icu_segmenter"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de"
dependencies = [
"core_maths",
"displaydoc",
"icu_collections",
"icu_locid",
"icu_provider",
"icu_segmenter_data",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_segmenter_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df"
[[package]]
name = "icu_timezone"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa91ba6a585939a020c787235daa8aee856d9bceebd6355e283c0c310bc6de96"
dependencies = [
"displaydoc",
"icu_calendar",
"icu_provider",
"icu_timezone_data",
"tinystr",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_timezone_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c588878c508a3e2ace333b3c50296053e6483c6a7541251b546cc59dcd6ced8e"
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linkify"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
dependencies = [
"memchr",
]
[[package]]
name = "litemap"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "num-bigint"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "proc-macro2"
version = "1.0.88"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.4.8",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"
dependencies = [
"memchr",
]
[[package]]
name = "regex-automata"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "serde"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "syn"
version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "textutils"
version = "0.1.0"
dependencies = [
"icu",
"icu_segmenter",
"itertools",
"lazy_static",
"linkify",
"once_cell",
"regex",
"unicase",
"unicode-normalization",
]
[[package]]
name = "tinystr"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "tinyvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "unicase"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df"
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-normalization"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
dependencies = [
"tinyvec",
]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
dependencies = [
"either",
]
[[package]]
name = "yoke"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb594dd55d87335c5f60177cee24f19457a5ec10a065e0a3014722ad252d0a1f"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View File

@@ -1,7 +0,0 @@
[package]
name = "flest"
version = "0.1.0"
edition = "2021"
[dependencies]
textutils = { path = "../textutils" }

View File

@@ -1,125 +0,0 @@
/// A single suggestion together with its ranking weight.
///
/// Ordering is defined on `confidence` only and is *reversed*: a candidate
/// with a higher confidence compares as `Less`, so an ascending sort puts the
/// most confident candidate first.
///
/// Note that equality (derived) compares every field while the ordering only
/// looks at `confidence`; two candidates can therefore compare as
/// `Ordering::Equal` without being `==`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Candidate {
    /// The suggested text.
    pub text: String,
    /// Optional additional text attached to the suggestion.
    pub secondary_text: Option<String>,
    /// Ranking weight; higher means more confident.
    pub confidence: u8,
}

impl PartialOrd for Candidate {
    /// Delegates to [`Ord::cmp`] so both orderings can never drift apart.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Candidate {
    /// Compares by `confidence` in descending order.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Operands are swapped on purpose: higher confidence sorts first.
        other.confidence.cmp(&self.confidence)
    }
}
/// A small bounded collection of [`Candidate`]s that is kept sorted from the
/// highest to the lowest confidence and never holds two entries with the same
/// text.
pub struct CandidateQueue {
    /// Current entries, re-sorted after every accepted `push`.
    entries: Vec<Candidate>,
    /// Maximum number of entries; always at least 1.
    capacity: usize,
}

impl CandidateQueue {
    /// Creates an empty queue holding at most `capacity` candidates.
    /// A `capacity` of 0 is raised to 1.
    pub fn with_capacity(capacity: usize) -> Self {
        let capacity = capacity.max(1);
        Self {
            entries: Vec::with_capacity(capacity),
            capacity,
        }
    }

    /// Offers a candidate to the queue.
    ///
    /// * `confidence` is ignored entirely when it is NaN; otherwise it is
    ///   clamped to `[0.0, 1.0]` and scaled to the `u8` range (truncating).
    /// * If `text` is already queued, the stored confidence becomes the
    ///   maximum of the old and the new value.
    /// * If the queue is full, the new candidate replaces the last (weakest)
    ///   entry only when it is strictly more confident.
    pub fn push(&mut self, text: String, confidence: f64) {
        if confidence.is_nan() {
            return;
        }
        let scaled = (f64::from(u8::MAX) * confidence.clamp(0.0, 1.0)) as u8;

        match self.entries.iter().position(|it| it.text == text) {
            Some(index) => {
                let existing = &mut self.entries[index];
                existing.confidence = existing.confidence.max(scaled);
            }
            None => {
                let candidate = Candidate {
                    text,
                    secondary_text: None,
                    confidence: scaled,
                };
                if self.entries.len() < self.capacity {
                    self.entries.push(candidate);
                } else {
                    // The queue is sorted, so the final slot is the weakest entry.
                    let weakest = &mut self.entries[self.capacity - 1];
                    if weakest.confidence < candidate.confidence {
                        *weakest = candidate;
                    }
                }
            }
        }
        // Stable sort: entries with equal confidence keep their relative order.
        self.entries.sort();
    }

    /// Consumes the queue and returns its entries, most confident first.
    pub fn into_sorted_vec(self) -> Vec<Candidate> {
        self.entries
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushes every `(text, confidence)` pair into a fresh queue of the given
    /// capacity and returns the resulting texts in queue order.
    fn ranked_texts(capacity: usize, pushes: &[(&str, f64)]) -> Vec<String> {
        let mut queue = CandidateQueue::with_capacity(capacity);
        for &(text, confidence) in pushes {
            queue.push(text.to_string(), confidence);
        }
        queue
            .into_sorted_vec()
            .into_iter()
            .map(|candidate| candidate.text)
            .collect()
    }

    #[test]
    fn basic_insertions() {
        let texts = ranked_texts(
            3,
            &[("foo", 0.5), ("bar", 0.7), ("baz", 0.6), ("qux", 0.8), ("quux", 0.9)],
        );
        assert_eq!(texts, ["quux", "qux", "bar"]);
    }

    #[test]
    fn basic_insertions_with_duplicates() {
        let texts = ranked_texts(
            3,
            &[
                ("foo", 0.5),
                ("bar", 0.7),
                ("baz", 0.6),
                ("qux", 0.8),
                ("quux", 0.9),
                ("quux", 0.9),
            ],
        );
        assert_eq!(texts, ["quux", "qux", "bar"]);
    }

    #[test]
    fn empty_candidate_set() {
        let texts = ranked_texts(3, &[]);
        assert!(texts.is_empty());
    }

    #[test]
    fn nan_confidence_insertions() {
        // The NaN entry must be dropped without disturbing the others.
        let texts = ranked_texts(3, &[("foo", 0.5), ("bar", f64::NAN), ("baz", 0.6)]);
        assert_eq!(texts, ["baz", "foo"]);
    }
}

View File

@@ -1,148 +0,0 @@
use core::fmt;
use std::collections::HashMap;
/// Edge label placed between two consecutive tokens of an n-gram path.
/// The `for_each*` walkers never descend through this edge.
pub const TOKEN_SEPARATOR: char = '\u{00}';

/// One node of a character trie; children are keyed by `char` and a node may
/// optionally carry a value.
#[derive(Default)]
pub struct DynTrieNode<T> {
    children: HashMap<char, DynTrieNode<T>>,
    pub value: Option<T>,
}

impl<T: Default> DynTrieNode<T> {
    /// Creates a node with no children and no value.
    pub fn new() -> Self {
        Self {
            children: HashMap::new(),
            value: None,
        }
    }

    /// Returns the child reached over edge `c`, if it exists.
    #[inline]
    pub fn traverse(&self, c: char) -> Option<&DynTrieNode<T>> {
        self.children.get(&c)
    }

    /// Mutable variant of [`Self::traverse`].
    #[inline]
    pub fn traverse_mut(&mut self, c: char) -> Option<&mut DynTrieNode<T>> {
        self.children.get_mut(&c)
    }

    /// Returns the child reached over edge `c`, creating an empty one first
    /// when it is missing.
    #[inline]
    pub fn traverse_or_insert(&mut self, c: char) -> &mut DynTrieNode<T> {
        self.children.entry(c).or_default()
    }

    /// Follows `token` character by character; `None` as soon as an edge is missing.
    /// The returned node may or may not carry a value.
    pub fn get(&self, token: &Vec<char>) -> Option<&DynTrieNode<T>> {
        token.iter().try_fold(self, |node, &c| node.traverse(c))
    }

    /// Mutable variant of [`Self::get`].
    pub fn get_mut(&mut self, token: &Vec<char>) -> Option<&mut DynTrieNode<T>> {
        let mut cursor = self;
        for &c in token {
            cursor = cursor.traverse_mut(c)?;
        }
        Some(cursor)
    }

    /// Follows `token`, creating missing nodes, and makes sure the final node
    /// holds a value (initialised with `T::default()` when absent).
    pub fn get_or_insert(&mut self, token: &Vec<char>) -> &mut DynTrieNode<T> {
        let mut cursor = self;
        for &c in token {
            cursor = cursor.traverse_or_insert(c);
        }
        cursor.value.get_or_insert_with(T::default);
        cursor
    }

    /// Follows every token of `ngram` in order, crossing a
    /// [`TOKEN_SEPARATOR`] edge between consecutive tokens.
    pub fn get_ngram(&self, ngram: &[Vec<char>]) -> Option<&DynTrieNode<T>> {
        ngram.iter().enumerate().try_fold(self, |node, (index, token)| {
            let start = if index > 0 {
                node.traverse(TOKEN_SEPARATOR)?
            } else {
                node
            };
            start.get(token)
        })
    }

    /// Mutable variant of [`Self::get_ngram`].
    pub fn get_ngram_mut(&mut self, ngram: &[Vec<char>]) -> Option<&mut DynTrieNode<T>> {
        let mut cursor = self;
        for (index, token) in ngram.iter().enumerate() {
            if index > 0 {
                cursor = cursor.traverse_mut(TOKEN_SEPARATOR)?;
            }
            cursor = cursor.get_mut(token)?;
        }
        Some(cursor)
    }

    /// Like [`Self::get_ngram_mut`] but creates every missing node on the
    /// way; each token's end node receives a default value if it had none.
    /// For an empty `ngram` the node itself is returned untouched.
    pub fn get_ngram_or_insert(&mut self, ngram: &[Vec<char>]) -> &mut DynTrieNode<T> {
        let mut cursor = self;
        for (index, token) in ngram.iter().enumerate() {
            if index > 0 {
                cursor = cursor.traverse_or_insert(TOKEN_SEPARATOR);
            }
            cursor = cursor.get_or_insert(token);
        }
        cursor
    }

    /// Calls `f(token, node)` for every value-carrying node reachable from
    /// this node without crossing a [`TOKEN_SEPARATOR`] edge. `token` is the
    /// character path from this node to `node`. Visiting order follows the
    /// `HashMap` iteration order and is therefore unspecified.
    pub fn for_each<F>(&self, f: &F) where F: Fn(&Vec<char>, &DynTrieNode<T>) {
        self.for_each_fnmut(&mut |token: &Vec<char>, node: &DynTrieNode<T>| f(token, node));
    }

    /// Same walk as [`Self::for_each`], for callbacks that mutate captured state.
    pub fn for_each_fnmut<F>(&self, f: &mut F) where F: FnMut(&Vec<char>, &DynTrieNode<T>) {
        let mut path = Vec::with_capacity(32);
        self.for_each_recursive_fnmut(&mut path, f);
    }

    /// Depth-first worker behind the `for_each*` entry points; `token` is the
    /// path accumulated so far and is restored before returning.
    fn for_each_recursive_fnmut<F>(&self, token: &mut Vec<char>, f: &mut F) where F: FnMut(&Vec<char>, &DynTrieNode<T>) {
        if self.value.is_some() {
            f(token, self);
        }
        let word_edges = self
            .children
            .iter()
            .filter(|(edge, _)| **edge != TOKEN_SEPARATOR);
        for (edge, child) in word_edges {
            token.push(*edge);
            child.for_each_recursive_fnmut(token, f);
            token.pop();
        }
    }
}
impl<T: Default + fmt::Debug> DynTrieNode<T> {
    /// Prints the whole subtree below this node to stdout, one line per
    /// child edge, indented by depth. The node's own value is not printed.
    pub fn debug_pretty_print(&self) {
        self.debug_pretty_print_recursive(0);
    }

    /// Prints the children of this node at the given `depth`, then recurses.
    fn debug_pretty_print_recursive(&self, depth: usize) {
        let indent = " ".repeat(depth);
        for (edge, child) in self.children.iter() {
            print!("{}", indent);
            println!("{:?}: {:?}", edge, child.value);
            child.debug_pretty_print_recursive(depth + 1);
        }
    }
}

View File

@@ -1,4 +0,0 @@
mod candidates;
mod dyntrie;
pub mod model;
mod types;

View File

@@ -1,68 +0,0 @@
mod prediction;
mod serialization;
mod spellcheck;
mod training;
mod version;
pub use prediction::*;
pub use serialization::*;
pub use spellcheck::*;
pub use training::*;
pub use version::*;
use crate::dyntrie::DynTrieNode;
/// Per-n-gram statistics, stored as the value of a trie node
/// (see `NgramModel::trie_root`).
#[derive(Default, Debug)]
pub struct NgramData {
// Value of the model's global time counter assigned the last time this
// n-gram was trained.
time: u64,
// Number of times this n-gram has been trained.
count: u64,
// NOTE(review): presumably marks an offensive entry; nothing in the visible
// code reads or sets this flag yet - confirm.
is_offensive: bool,
// flag should only be set for 1st level words!!
is_dictionary_word: bool,
}
/// Bookkeeping data of an `NgramModel`.
pub struct NgramModelMeta {
// Format version of the model; `NgramModel::new` sets it to the latest one.
version: NgramModelVersion,
// Logical clock, advanced by one on every `update_and_get_time` call.
global_time: u64,
// Advanced by one on every `update_and_get_time` call, together with
// `global_time`.
global_count: u64,
// Token that training and prediction put in front of every sentence.
pub sentence_token: String,
}
/// Tunable settings of an `NgramModel`.
pub struct NgramModelOptions {
// Capacity of the candidate queue used by the prediction functions.
pub max_candidates: usize,
// Largest n-gram length that is trained; prediction looks at up to
// `max_ngram_size - 1` history words.
pub max_ngram_size: usize,
// NOTE(review): presumably controls whether offensive entries may be
// suggested; no visible code reads this option yet - confirm.
pub allow_offensive: bool,
}
/// An n-gram language model: a trie of n-gram statistics plus its metadata
/// and options.
pub struct NgramModel {
// Root of the trie; n-grams are stored token by token with
// `TOKEN_SEPARATOR` edges in between.
pub trie_root: DynTrieNode<NgramData>,
// Version and global counters.
pub meta: NgramModelMeta,
// Settings used by training and prediction.
pub options: NgramModelOptions,
}
impl NgramModel {
    /// Creates an empty model with the latest version, zeroed counters,
    /// U+001E as the sentence token and the default options
    /// (5 candidates, n-grams up to length 3, offensive entries disallowed).
    pub fn new() -> Self {
        let trie_root = DynTrieNode::new();
        let meta = NgramModelMeta {
            version: NgramModelVersion::latest(),
            global_time: 0,
            global_count: 0,
            sentence_token: String::from("\u{1e}"),
        };
        let options = NgramModelOptions {
            max_candidates: 5,
            max_ngram_size: 3,
            allow_offensive: false,
        };
        Self {
            trie_root,
            meta,
            options,
        }
    }
}
impl NgramModelMeta {
    /// Advances both global counters by one and returns the new value of
    /// `global_time`.
    fn update_and_get_time(&mut self) -> u64 {
        self.global_time += 1;
        self.global_count += 1;
        self.global_time
    }
}

View File

@@ -1,149 +0,0 @@
use textutils::{fuzzy, normalization::StringNormalizationHelpers};
use crate::{candidates::{Candidate, CandidateQueue}, dyntrie::TOKEN_SEPARATOR};
use super::NgramModel;
impl NgramModel {
    /// Predicts candidates for the last element of `partial_sentence`.
    ///
    /// The sentence token is put in front of the words and every token is
    /// converted to NFD characters. If the last word is empty, the next word
    /// is predicted from the history; otherwise candidates for completing or
    /// correcting the last word are returned. An empty `partial_sentence`
    /// yields no candidates.
    pub fn predict(&self, partial_sentence: &Vec<&str>) -> Vec<Candidate> {
        if partial_sentence.is_empty() {
            return vec![];
        }

        let mut partial_sentence_nfd = Vec::with_capacity(partial_sentence.len() + 1);
        partial_sentence_nfd.push(self.meta.sentence_token.to_nfd_chars());
        for word in partial_sentence {
            partial_sentence_nfd.push(word.to_nfd_chars());
        }

        match partial_sentence_nfd.split_last() {
            Some((curr_word_nfd, history_nfd)) if !curr_word_nfd.is_empty() => {
                self.predict_curr_word(curr_word_nfd, history_nfd)
            }
            Some((_, history_nfd)) => self.predict_next_word(history_nfd),
            // Unreachable: the sentence token is always present.
            None => vec![],
        }
    }

    /// Predicts the word following `history_nfd`; every first-level word of
    /// the trie is considered.
    fn predict_next_word(&self, history_nfd: &[Vec<char>]) -> Vec<Candidate> {
        self.collect_candidates(history_nfd, |_| false)
    }

    /// Predicts candidates for the word currently being typed; only
    /// first-level words whose fuzzy match score against `curr_word_nfd` is
    /// not below 0.5 are considered.
    fn predict_curr_word(&self, curr_word_nfd: &Vec<char>, history_nfd: &[Vec<char>]) -> Vec<Candidate> {
        // TODO: implement fuzzy_for_each_fnmut
        // TODO: the fuzzy matcher needs to be written completely from scratch, return a FuzzyResult instead of f64
        self.collect_candidates(history_nfd, |word| fuzzy::str_match_live(word, curr_word_nfd) < 0.5)
    }

    /// Shared scan behind both prediction modes.
    ///
    /// Walks every first-level word for which `skip_word` returns `false`
    /// and follows the (reversed) history below it, most recent history word
    /// first. Each history level that is found with a value pushes the word
    /// into the candidate queue with a confidence derived from the word's
    /// and the history node's time/count statistics. The walk for a word
    /// stops at the first missing level.
    fn collect_candidates<P>(&self, history_nfd: &[Vec<char>], skip_word: P) -> Vec<Candidate>
    where
        P: Fn(&Vec<char>) -> bool,
    {
        let mut candidate_queue = CandidateQueue::with_capacity(self.options.max_candidates);
        // `saturating_sub` keeps a `max_ngram_size` of 0 from underflowing;
        // with a depth of 0 no candidate is produced.
        let max_history_depth = self
            .options
            .max_ngram_size
            .saturating_sub(1)
            .min(history_nfd.len());

        // Time is normalised over the most recent 300 ticks, count over the
        // whole range of the global counter.
        let tmax = self.meta.global_time;
        let tmin = tmax.saturating_sub(300);
        let cmax = self.meta.global_count;
        let cmin: u64 = 0;

        self.trie_root.for_each_fnmut(&mut |word, word_node| {
            if skip_word(word) {
                return;
            }
            // Words that were never followed by a history have nothing to offer.
            let mut hist_node = match word_node.traverse(TOKEN_SEPARATOR) {
                Some(node) => node,
                None => return,
            };
            // The walker only reports nodes that carry a value.
            let value = match word_node.value.as_ref() {
                Some(value) => value,
                None => return,
            };
            let time_conf = norm_weight(value.time, tmin, tmax);
            let count_conf = norm_weight(value.count, cmin, cmax);

            for hist_word in history_nfd.iter().rev().take(max_history_depth) {
                // TODO: instead of get use fuzzy get with:
                //  case-insensitive match and accent-insensitive match
                hist_node = match hist_node.get(hist_word) {
                    Some(node) => node,
                    None => return,
                };
                let hist_value = match hist_node.value.as_ref() {
                    Some(value) => value,
                    None => return,
                };
                let hist_time_conf = norm_weight(hist_value.time, tmin, tmax);
                let hist_count_conf = norm_weight(hist_value.count, cmin, cmax);
                let hist_conf = calc_confidence(hist_time_conf, hist_count_conf, 1.0);
                let conf = calc_confidence(time_conf, count_conf, hist_conf);
                candidate_queue.push(word.iter().collect(), conf);

                hist_node = match hist_node.traverse(TOKEN_SEPARATOR) {
                    Some(node) => node,
                    None => return,
                };
            }
        });

        candidate_queue.into_sorted_vec()
    }
}
/// Combines the three partial confidences into one score using fixed
/// weights: 45 % recency, 10 % frequency and 45 % history.
///
/// This runs once per scored candidate, so it must stay free of I/O
/// (the former debug print to stdout was removed for that reason).
fn calc_confidence(time_conf: f64, count_conf: f64, hist_conf: f64) -> f64 {
    // TODO: count_conf is messed up
    0.45 * time_conf + 0.10 * count_conf + 0.45 * hist_conf
}
/// Maps `x` from the range `[xmin, xmax]` to `[0.0, 1.0]`.
///
/// Values at or below `xmin` give 0.0 (this check comes first, so it also
/// wins when `xmin >= xmax`), values at or above `xmax` give 1.0. In between
/// the linear position `t` is shaped with `2t - t^2`.
fn norm_weight(x: u64, xmin: u64, xmax: u64) -> f64 {
    if x <= xmin {
        0.0
    } else if x >= xmax {
        1.0
    } else {
        // Here xmin < x < xmax, so the divisor is never zero.
        let xnorm = (x - xmin) as f64 / (xmax - xmin) as f64;
        2.0 * xnorm - xnorm.powi(2)
    }
}

View File

@@ -1,19 +0,0 @@
use crate::types::FlestResult;
use super::NgramModel;
impl NgramModel {
pub fn from_file(path: &str) -> FlestResult<Self> {
let mut model = NgramModel::new();
model.load_from_file(path)?;
return Ok(model);
}
pub fn load_from_file(&mut self, path: &str) -> FlestResult<()> {
todo!()
}
pub fn persist_to_file(&self, path: &str) -> FlestResult<()> {
todo!()
}
}

View File

@@ -1,13 +0,0 @@
use crate::candidates::Candidate;
use super::NgramModel;
// Spell-checking entry points of the model. Both are placeholders for now.
impl NgramModel {
// Not implemented yet: the body is `todo!()`, so calling this panics.
fn spell(&self, curr_word: &str, history: &Vec<&str>) -> Vec<Candidate> {
todo!()
}
// Not implemented yet: the body is `todo!()`, so calling this panics.
fn spell_sentence(&self, sentence: &Vec<&str>) -> Vec<Option<Vec<Candidate>>> {
todo!()
}
}

View File

@@ -1,41 +0,0 @@
use textutils::normalization::StringNormalizationHelpers;
use super::NgramModel;
impl NgramModel {
    /// Trains the model with one sentence.
    ///
    /// The sentence token is put in front of the words and every token is
    /// converted to NFD characters. For each position, all n-grams ending at
    /// that position with a length of 1 up to `max_ngram_size` are stored in
    /// reversed token order (current word first), and their time stamp and
    /// count are updated.
    ///
    /// With `max_ngram_size == 0` nothing is trained, matching
    /// [`Self::train_from_tokens`].
    pub fn train_from_sentence(&mut self, sentence: &Vec<&str>) {
        let max_ngram_size = self.options.max_ngram_size;
        if max_ngram_size == 0 {
            // Guards the `max_ngram_size - 1` below against underflow.
            return;
        }

        let mut sentence_nfd = Vec::with_capacity(sentence.len() + 1);
        sentence_nfd.push(self.meta.sentence_token.to_nfd_chars());
        for word in sentence {
            sentence_nfd.push(word.to_nfd_chars());
        }

        for sent_i in 0..sentence_nfd.len() {
            // `ngram_i` never exceeds `sent_i`, so the start index cannot underflow.
            for ngram_i in 0..=(max_ngram_size - 1).min(sent_i) {
                let start = sent_i - ngram_i;
                let ngram = sentence_nfd[start..=sent_i]
                    .iter()
                    .rev()
                    .cloned()
                    .collect::<Vec<_>>();
                let node = self.trie_root.get_ngram_or_insert(&ngram);
                let data = node.value.get_or_insert_with(Default::default);
                data.time = self.meta.update_and_get_time();
                data.count += 1;
            }
        }
    }

    /// Trains the model with a plain token sequence (no sentence token is
    /// added). Every contiguous n-gram of length 1 up to `max_ngram_size` is
    /// stored in reversed token order and its time stamp and count are
    /// updated, shorter n-grams first.
    pub fn train_from_tokens(&mut self, tokens: &Vec<&str>) {
        for n in 1..=self.options.max_ngram_size {
            // `windows` yields nothing when `n` exceeds the number of tokens.
            for window in tokens.windows(n) {
                let ngram = window
                    .iter()
                    .rev()
                    .map(|&x| x.to_nfd_chars())
                    .collect::<Vec<_>>();
                let node = self.trie_root.get_ngram_or_insert(ngram.as_slice());
                let data = node.value.get_or_insert_with(Default::default);
                data.time = self.meta.update_and_get_time();
                data.count += 1;
            }
        }
    }
}

View File

@@ -1,70 +0,0 @@
use core::fmt;
/// Version of the n-gram model format as a `major.minor` pair.
/// Versions order by `major` first, then by `minor`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct NgramModelVersion {
    major: u8,
    minor: u8,
}

/// Builds an `NgramModelVersion` from a major and an optional minor number
/// (the minor number defaults to 0).
#[macro_export]
macro_rules! ngram_model_version {
    ($major:expr, $minor:expr) => {
        NgramModelVersion { major: $major, minor: $minor }
    };
    ($major:expr) => {
        NgramModelVersion { major: $major, minor: 0 }
    };
}

#[allow(non_upper_case_globals)]
impl NgramModelVersion {
    /// Development version, 0.0.
    pub const vDEV: Self = ngram_model_version!(0, 0);
    /// Version 0.1.
    pub const v0_1: Self = ngram_model_version!(0, 1);

    /// Returns the most recent known version.
    pub fn latest() -> Self {
        Self::v0_1
    }
}

impl fmt::Display for NgramModelVersion {
    /// Writes `vDEV` for version 0.0 and `v<major>.<minor>` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match (self.major, self.minor) {
            (0, 0) => f.write_str("vDEV"),
            (major, minor) => write!(f, "v{}.{}", major, minor),
        }
    }
}

impl fmt::Debug for NgramModelVersion {
    /// Writes the display form followed by both numbers as two-digit hex.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { major, minor } = *self;
        write!(f, "{} (0x{:02x}{:02x})", self, major, minor)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn display() {
        let v0_1 = ngram_model_version!(0, 1);
        let v1_0 = ngram_model_version!(1, 0);

        // Display form.
        assert_eq!(v0_1.to_string(), "v0.1");
        assert_eq!(v1_0.to_string(), "v1.0");
        // Debug form adds the hex encoding.
        assert_eq!(format!("{:?}", v0_1), "v0.1 (0x0001)");
        assert_eq!(format!("{:?}", v1_0), "v1.0 (0x0100)");
    }

    #[test]
    fn equality() {
        let v0_1 = ngram_model_version!(0, 1);
        let v1_0 = ngram_model_version!(1, 0);

        assert_eq!(v0_1, ngram_model_version!(0, 1));
        assert_eq!(v1_0, ngram_model_version!(1, 0));
        assert_ne!(v0_1, v1_0);
    }

    #[test]
    fn comparison() {
        let v1_0 = ngram_model_version!(1, 0);

        assert!(ngram_model_version!(0, 1) > ngram_model_version!(0, 0));
        assert!(v1_0 > ngram_model_version!(0, 1));
        // The major number dominates the minor number.
        assert!(v1_0 > ngram_model_version!(0, 42));
    }
}

View File

@@ -1,3 +0,0 @@
use std::error::Error;
/// Result type used across the crate; the error side is any boxed
/// `std::error::Error`.
pub type FlestResult<T> = Result<T, Box<dyn Error>>;

View File

@@ -1,821 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "calendrical_calculations"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cec493b209a1b81fa32312d7ceca1b547d341c7b5f16a3edbf32b1d8b455bbdf"
dependencies = [
"core_maths",
"displaydoc",
]
[[package]]
name = "core_maths"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3"
dependencies = [
"libm",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "fixed_decimal"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0febbeb1118a9ecdee6e4520ead6b54882e843dd0592ad233247dbee84c53db8"
dependencies = [
"displaydoc",
"smallvec",
"writeable",
]
[[package]]
name = "icu"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff5e3018d703f168b00dcefa540a65f1bbc50754ae32f3f5f0e43fe5ee51502"
dependencies = [
"icu_calendar",
"icu_casemap",
"icu_collator",
"icu_collections",
"icu_datetime",
"icu_decimal",
"icu_experimental",
"icu_list",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_plurals",
"icu_properties",
"icu_provider",
"icu_segmenter",
"icu_timezone",
]
[[package]]
name = "icu_calendar"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7265b2137f9a36f7634a308d91f984574bbdba8cfd95ceffe1c345552275a8ff"
dependencies = [
"calendrical_calculations",
"displaydoc",
"icu_calendar_data",
"icu_locid",
"icu_locid_transform",
"icu_provider",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_calendar_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e009b7f0151ee6fb28c40b1283594397e0b7183820793e9ace3dcd13db126d0"
[[package]]
name = "icu_casemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff0c8ae9f8d31b12e27fc385ff9ab1f3cd9b17417c665c49e4ec958c37da75f"
dependencies = [
"displaydoc",
"icu_casemap_data",
"icu_collections",
"icu_locid",
"icu_properties",
"icu_provider",
"writeable",
"zerovec",
]
[[package]]
name = "icu_casemap_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d57966d5ab748f74513be4046867f9a20e801e2775d41f91d04a0f560b61f08"
[[package]]
name = "icu_collator"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d370371887d31d56f361c3eaa15743e54f13bc677059c9191c77e099ed6966b2"
dependencies = [
"displaydoc",
"icu_collator_data",
"icu_collections",
"icu_locid_transform",
"icu_normalizer",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_collator_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ee3f88741364b7d6269cce6827a3e6a8a2cf408a78f766c9224ab479d5e4ae5"
[[package]]
name = "icu_collections"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_datetime"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d115efb85e08df3fd77e77f52e7e087545a783fffba8be80bfa2102f306b1780"
dependencies = [
"displaydoc",
"either",
"fixed_decimal",
"icu_calendar",
"icu_datetime_data",
"icu_decimal",
"icu_locid",
"icu_locid_transform",
"icu_plurals",
"icu_provider",
"icu_timezone",
"smallvec",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_datetime_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ba7e7f7a01269b9afb0a39eff4f8676f693b55f509b3120e43a0350a9f88bea"
[[package]]
name = "icu_decimal"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb8fd98f86ec0448d85e1edf8884e4e318bb2e121bd733ec929a05c0a5e8b0eb"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_decimal_data",
"icu_locid_transform",
"icu_provider",
"writeable",
]
[[package]]
name = "icu_decimal_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d424c994071c6f5644f999925fc868c85fec82295326e75ad5017bc94b41523"
[[package]]
name = "icu_experimental"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "844ad7b682a165c758065d694bc4d74ac67f176da1c499a04d85d492c0f193b7"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_collections",
"icu_decimal",
"icu_experimental_data",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_pattern",
"icu_plurals",
"icu_properties",
"icu_provider",
"litemap",
"num-bigint",
"num-rational",
"num-traits",
"smallvec",
"tinystr",
"writeable",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_experimental_data"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c178b9a34083fca5bd70d61f647575335e9c197d0f30c38e8ccd187babc69d0"
[[package]]
name = "icu_list"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbfeda1d7775b6548edd4e8b7562304a559a91ed56ab56e18961a053f367c365"
dependencies = [
"displaydoc",
"icu_list_data",
"icu_locid_transform",
"icu_provider",
"regex-automata 0.2.0",
"writeable",
]
[[package]]
name = "icu_list_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1825170d2c6679cb20dbd96a589d034e49f698aed9a2ef4fafc9a0101ed298f"
[[package]]
name = "icu_locid"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_pattern"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7f36aafd098d6717de34e668a8120822275c1fba22b936e757b7de8a2fd7e4"
dependencies = [
"displaydoc",
"either",
"writeable",
"yoke",
"zerofrom",
]
[[package]]
name = "icu_plurals"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba5a70e7c025dbd5c501b0a5c188cd11666a424f0dadcd4f0a95b7dafde3b114"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_locid_transform",
"icu_plurals_data",
"icu_provider",
"zerovec",
]
[[package]]
name = "icu_plurals_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3e8f775b215d45838814a090a2227247a7431d74e9156407d9c37f6ef0f208"
[[package]]
name = "icu_properties"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "icu_segmenter"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de"
dependencies = [
"core_maths",
"displaydoc",
"icu_collections",
"icu_locid",
"icu_provider",
"icu_segmenter_data",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_segmenter_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df"
[[package]]
name = "icu_timezone"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa91ba6a585939a020c787235daa8aee856d9bceebd6355e283c0c310bc6de96"
dependencies = [
"displaydoc",
"icu_calendar",
"icu_provider",
"icu_timezone_data",
"tinystr",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_timezone_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c588878c508a3e2ace333b3c50296053e6483c6a7541251b546cc59dcd6ced8e"
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linkify"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
dependencies = [
"memchr",
]
[[package]]
name = "litemap"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "num-bigint"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "proc-macro2"
version = "1.0.88"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.4.8",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"
dependencies = [
"memchr",
]
[[package]]
name = "regex-automata"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "serde"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "syn"
version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "textutils"
version = "0.1.0"
dependencies = [
"icu",
"icu_segmenter",
"itertools",
"lazy_static",
"linkify",
"once_cell",
"regex",
"unicase",
"unicode-normalization",
]
[[package]]
name = "tinystr"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "tinyvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "unicase"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df"
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-normalization"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
dependencies = [
"tinyvec",
]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
dependencies = [
"either",
]
[[package]]
name = "yoke"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb594dd55d87335c5f60177cee24f19457a5ec10a065e0a3014722ad252d0a1f"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View File

@@ -1,17 +0,0 @@
[package]
name = "textutils"
version = "0.1.0"
edition = "2021"
[dependencies]
icu = { version = "1.5.0", features = [
"compiled_data"
] }
icu_segmenter = "1.5.0"
itertools = "0.13.0"
lazy_static = "1.5.0"
linkify = "0.10.0"
once_cell = "1.20.2"
regex = "1.11.1"
unicase = "2.8.0"
unicode-normalization = "0.1.24"

View File

@@ -1,20 +0,0 @@
use lazy_static::lazy_static;
use linkify::{self, LinkFinder};
use regex::Regex;
lazy_static! {
    /// Link detector created with `LinkFinder::new()`, i.e. the library's default settings.
    static ref LINK_FINDER: LinkFinder = LinkFinder::new();

    /// Pattern for Reddit-style references: an optional leading `/`, followed by either
    /// `r/` plus 3 to 21 characters from `[a-zA-Z0-9_]`, or `u/` plus 3 to 20 characters
    /// from `[a-zA-Z0-9_-]`.
    static ref REDDIT_REGEX: Regex = Regex::new(
        r"\/?(r\/[a-zA-Z0-9_]{3}[a-zA-Z0-9_]{0,18}|u\/[a-zA-Z0-9_-]{3}[a-zA-Z0-9_-]{0,17})",
    )
    .unwrap();
}
/// Removes links and Reddit-style references from `text`.
///
/// Every span reported by `LINK_FINDER` is cut out first; afterwards every match of
/// `REDDIT_REGEX` in the remaining text is replaced with the empty string. Nothing else
/// (whitespace, punctuation around the removed spans) is touched.
pub fn preprocess_auto(text: &str) -> String {
    // Gather the slices of `text` that lie between consecutive links.
    let mut kept: Vec<&str> = Vec::new();
    let mut cursor = 0;
    for link in LINK_FINDER.links(text) {
        kept.push(&text[cursor..link.start()]);
        cursor = link.end();
    }
    // Tail after the last link (or the whole text when no link was found).
    kept.push(&text[cursor..]);

    let without_links = kept.concat();
    REDDIT_REGEX.replace_all(&without_links, "").into_owned()
}

View File

@@ -1,292 +0,0 @@
use icu::properties::sets;
/// Sentinel character the fuzzy matcher uses for "no character": it stands in for reads
/// past the end of a word and is the initial value of the previous-character trackers.
const NULLCHAR: char = '\0';
/// Selects which word's counted characters normalise the penalty in the fuzzy matcher.
#[derive(Eq, PartialEq)]
pub enum FuzzyMatchStrategy {
    /// Normalise by the larger of the two words' counted characters.
    MaxScore,
    /// Normalise by the counted characters of the left-hand word.
    ScoreLhs,
    /// Normalise by the counted characters of the right-hand word.
    ScoreRhs,
}
/// Scores how closely `word1` and `word2` match, as a value in `[0.0, 1.0]`.
///
/// Both slices are walked in lock-step. Characters contained in ICU's diacritic set are
/// skipped on either side, so they never contribute to the score. Every compared position
/// may add a penalty:
///
/// * identical characters: none
/// * characters that are equal after lowercasing: `0.1`
/// * any other mismatch: `1.0`, or `2.0` when it is the very first compared pair
/// * a mismatch that swaps the directly preceding mismatched pair (a transposition):
///   half of that preceding penalty is refunded, exactly once
/// * one word has run out while the other still has characters: `1.0` per leftover
///
/// The result is `1.0 - penalty / score`, clamped to `[0.0, 1.0]`, where `score` is the
/// number of counted (non-diacritic) characters selected by `strategy`. With
/// `ScoreLhs` / `ScoreRhs` the comparison stops as soon as the scored word is exhausted.
/// If the selected count is zero the words are treated as a perfect match.
fn str_match_impl(word1: &[char], word2: &[char], strategy: FuzzyMatchStrategy) -> f64 {
    /// Returns the first character at or after `*i` that is not a diacritic, advancing
    /// `*i` past every skipped diacritic. Yields `NULLCHAR` once `word` is exhausted.
    fn next(word: &[char], i: &mut usize) -> char {
        loop {
            let ch = *word.get(*i).unwrap_or(&NULLCHAR);
            if ch == NULLCHAR || !sets::diacritic().contains(ch) {
                return ch;
            }
            *i += 1;
        }
    }

    let len1 = word1.len();
    let len2 = word2.len();
    let mut score1: f64 = 0.0;
    let mut score2: f64 = 0.0;
    let mut penalty: f64 = 0.0;
    // Penalty charged for the most recent plain mismatch that has not been refunded yet.
    let mut last_penalty_awarded: f64 = 0.0;
    let mut i1: usize = 0;
    let mut i2: usize = 0;
    let mut last_ch1: char = NULLCHAR;
    let mut last_ch2: char = NULLCHAR;

    while (i1 < len1 && strategy != FuzzyMatchStrategy::ScoreRhs)
        || (i2 < len2 && strategy != FuzzyMatchStrategy::ScoreLhs)
    {
        let ch1 = next(word1, &mut i1);
        let ch2 = next(word2, &mut i2);
        if ch1 == NULLCHAR && ch2 == NULLCHAR {
            break;
        }

        // The loop condition is evaluated before diacritics are skipped, so the word that
        // drives the strategy can turn out to be exhausted only here (e.g. it ends in a
        // combining mark). Stop instead of penalising the other word's leftovers.
        let scored_side_exhausted = match strategy {
            FuzzyMatchStrategy::ScoreLhs => ch1 == NULLCHAR,
            FuzzyMatchStrategy::ScoreRhs => ch2 == NULLCHAR,
            FuzzyMatchStrategy::MaxScore => false,
        };
        if scored_side_exhausted {
            break;
        }

        if ch1 != NULLCHAR {
            score1 += 1.0;
        }
        if ch2 != NULLCHAR {
            score2 += 1.0;
        }

        if ch1 == NULLCHAR || ch2 == NULLCHAR {
            // Exactly one word still has characters. `next` never returns a diacritic,
            // so the leftover character always costs a full point.
            penalty += 1.0;
            i1 += 1;
            i2 += 1;
            continue;
        }

        if ch1 != ch2 {
            if ch1.to_lowercase().eq(ch2.to_lowercase()) {
                // Same letter, different case.
                penalty += 0.1;
            } else if last_penalty_awarded > 0.0 && ch1 == last_ch2 && ch2 == last_ch1 {
                // Transposition of the preceding mismatched pair: refund half of what that
                // mismatch cost. The refund is consumed so that alternating inputs such as
                // "abab" / "baba" cannot keep collecting it and drive the penalty negative.
                penalty -= 0.5 * last_penalty_awarded;
                last_penalty_awarded = 0.0;
            } else {
                last_penalty_awarded = 1.0;
                if last_ch1 == NULLCHAR && last_ch2 == NULLCHAR {
                    // A mismatch on the very first compared pair weighs double.
                    last_penalty_awarded += 1.0;
                }
                penalty += last_penalty_awarded;
            }
        }

        i1 += 1;
        i2 += 1;
        last_ch1 = ch1;
        last_ch2 = ch2;
    }

    let score = match strategy {
        FuzzyMatchStrategy::ScoreLhs => score1,
        FuzzyMatchStrategy::ScoreRhs => score2,
        FuzzyMatchStrategy::MaxScore => f64::max(score1, score2),
    };
    if score == 0.0 {
        // Nothing was counted on the scored side, so the words are considered matching.
        return 1.0;
    }
    (1.0 - penalty / score).clamp(0.0, 1.0)
}
/// Fuzzy-compares two words using `FuzzyMatchStrategy::MaxScore`, i.e. the penalty is
/// normalised by the larger counted character total of the two words.
#[inline]
pub fn str_match(word1: &[char], word2: &[char]) -> f64 {
    str_match_impl(word1, word2, FuzzyMatchStrategy::MaxScore)
}
/// Fuzzy-compares `curr_user_word` against `base_word` using
/// `FuzzyMatchStrategy::ScoreRhs`, i.e. the penalty is normalised by the counted
/// characters of `curr_user_word` only.
#[inline]
pub fn str_match_live(base_word: &[char], curr_user_word: &[char]) -> f64 {
    str_match_impl(base_word, curr_user_word, FuzzyMatchStrategy::ScoreRhs)
}
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalization::StringNormalizationHelpers;

    /// NFD-decomposes both inputs and scores them with `str_match`.
    fn nfd_score(lhs: &str, rhs: &str) -> f64 {
        str_match(&lhs.to_nfd_chars(), &rhs.to_nfd_chars())
    }

    #[test]
    fn ascii_basic_match() {
        assert_eq!(nfd_score("abc", "abc"), 1.0);
    }

    #[test]
    fn ascii_basic_mismatch() {
        assert_eq!(nfd_score("abc", "def"), 0.0);
    }

    #[test]
    fn ascii_casing_diff_one_char() {
        assert_eq!(nfd_score("a", "A"), 0.9);
    }

    #[test]
    fn ascii_casing_diff_multiple_chars() {
        assert_eq!(nfd_score("abc", "ABC"), 0.9);
    }

    #[test]
    fn diacritic_basic_match_lowercase() {
        assert_eq!(nfd_score("ä", "ä"), 1.0);
    }

    #[test]
    fn diacritic_basic_match_uppercase() {
        assert_eq!(nfd_score("Ä", "Ä"), 1.0);
    }

    #[test]
    fn diacritic_basic_mismatch_lowercase() {
        assert_eq!(nfd_score("ä", "ö"), 0.0);
    }

    #[test]
    fn diacritic_basic_mismatch_uppercase() {
        assert_eq!(nfd_score("Ä", "Ö"), 0.0);
    }

    #[test]
    fn diacritic_casing_and_accent_diff() {
        // Case differences cost 0.1; accent-only differences are free.
        assert_eq!(nfd_score("ä", "Ä"), 0.9);
        assert_eq!(nfd_score("ä", "A"), 0.9);
        assert_eq!(nfd_score("Ä", "a"), 0.9);
        assert_eq!(nfd_score("ä", "a"), 1.0);
        assert_eq!(nfd_score("Ä", "A"), 1.0);
    }

    #[test]
    fn empty_match() {
        assert_eq!(nfd_score("", ""), 1.0);
    }

    #[test]
    fn transposition_basic_1_start() {
        assert_eq!(nfd_score("abxx", "baxx"), 0.75);
    }

    #[test]
    fn transposition_basic_2_middle() {
        assert_eq!(nfd_score("xabx", "xbax"), 0.875);
    }

    #[test]
    fn transposition_basic_3_end() {
        assert_eq!(nfd_score("xxab", "xxba"), 0.875);
    }

    #[test]
    fn transposition_diactric_1_start() {
        assert_eq!(nfd_score("äbxx", "bäxx"), 0.75);
    }

    #[test]
    fn transposition_diactric_2_middle() {
        assert_eq!(nfd_score("xäbx", "xbäx"), 0.875);
    }

    #[test]
    fn transposition_diactric_3_end() {
        assert_eq!(nfd_score("xxäb", "xxbä"), 0.875);
    }

    #[test]
    fn unicode_normalization_basic_mismatch() {
        // The same letter in two different normalization forms is expected to score 0.0.
        let decomposed = "ä".to_nfd_chars();
        let composed = "ä".to_nfc_chars();
        assert_eq!(str_match(&decomposed, &composed), 0.0);
    }

    #[test]
    fn words_english_many() {
        let cases = [
            ("hello", "hello", 1.0),
            ("hello", "hallo", 0.8),
            ("hello", "helo", 0.6),
        ];
        for (word1, word2, expected_score) in cases {
            assert_eq!(
                nfd_score(word1, word2),
                expected_score,
                "Mismatch for words '{}' and '{}'",
                word1,
                word2
            );
        }
    }
}

View File

@@ -1,54 +0,0 @@
pub mod filter;
pub mod fuzzy;
pub mod normalization;
pub mod properties;
pub mod segment;
#[cfg(test)]
mod tests {
    use icu_segmenter::{SentenceSegmenter, WordSegmenter};

    use super::filter::preprocess_auto;
    use super::segment::{split_sentences, split_words};

    #[test]
    fn segment_sentences_simple() {
        let sentences = split_sentences(
            "Hello, world! How are you? I'm fine.",
            &SentenceSegmenter::new(),
        );
        assert_eq!(&sentences, &["Hello, world!", "How are you?", "I'm fine."]);
    }

    #[test]
    fn segment_words_simple() {
        let words = split_words(
            "Hello, world! How are you? I'm fine.",
            &WordSegmenter::new_auto(),
        );
        assert_eq!(&words, &["Hello", "world", "How", "are", "you", "I'm", "fine"]);
    }

    #[test]
    fn preprocess_auto_simple() {
        let cleaned = preprocess_auto("Hello, world! How are you? I'm fine. https://example.com and more");
        assert_eq!(&cleaned, "Hello, world! How are you? I'm fine. and more");
    }

    #[test]
    fn preprocess_reddit_ids() {
        let cleaned = preprocess_auto("have a look at r/cats, user u/example posed a cute cat in there");
        assert_eq!(&cleaned, "have a look at , user posed a cute cat in there");
    }

    #[test]
    fn preprocess_url_markdown() {
        // The URL is stripped but the surrounding markdown brackets stay in place.
        let cleaned = preprocess_auto(
            "You can find an example [in the documentation](https://example.com) or on GitHub",
        );
        assert_eq!(&cleaned, "You can find an example [in the documentation]() or on GitHub");

        // The leftover brackets must not show up as words.
        let words = split_words(&cleaned, &WordSegmenter::new_auto());
        assert_eq!(
            &words,
            &["You", "can", "find", "an", "example", "in", "the", "documentation", "or", "on", "GitHub"]
        );
    }
}

View File

@@ -1,61 +0,0 @@
use unicode_normalization::UnicodeNormalization;
/// Shorthand conversions of a text into the four Unicode normalization forms, each
/// available as a `Vec<char>` or as a `String`.
pub trait StringNormalizationHelpers {
    /// NFD form, as individual characters.
    fn to_nfd_chars(&self) -> Vec<char>;

    /// NFD form, as a string.
    fn to_nfd_string(&self) -> String;

    /// NFC form, as individual characters.
    fn to_nfc_chars(&self) -> Vec<char>;

    /// NFC form, as a string.
    fn to_nfc_string(&self) -> String;

    /// NFKD form, as individual characters.
    fn to_nfkd_chars(&self) -> Vec<char>;

    /// NFKD form, as a string.
    fn to_nfkd_string(&self) -> String;

    /// NFKC form, as individual characters.
    fn to_nfkc_chars(&self) -> Vec<char>;

    /// NFKC form, as a string.
    fn to_nfkc_string(&self) -> String;
}
/// Implements the helpers for `str` by collecting the iterators returned by the
/// `UnicodeNormalization` methods `nfd`, `nfc`, `nfkd` and `nfkc`.
impl StringNormalizationHelpers for str {
    #[inline]
    fn to_nfd_chars(&self) -> Vec<char> {
        Vec::from_iter(self.nfd())
    }

    #[inline]
    fn to_nfd_string(&self) -> String {
        String::from_iter(self.nfd())
    }

    #[inline]
    fn to_nfc_chars(&self) -> Vec<char> {
        Vec::from_iter(self.nfc())
    }

    #[inline]
    fn to_nfc_string(&self) -> String {
        String::from_iter(self.nfc())
    }

    #[inline]
    fn to_nfkd_chars(&self) -> Vec<char> {
        Vec::from_iter(self.nfkd())
    }

    #[inline]
    fn to_nfkd_string(&self) -> String {
        String::from_iter(self.nfkd())
    }

    #[inline]
    fn to_nfkc_chars(&self) -> Vec<char> {
        Vec::from_iter(self.nfkc())
    }

    #[inline]
    fn to_nfkc_string(&self) -> String {
        String::from_iter(self.nfkc())
    }
}

View File

@@ -1,33 +0,0 @@
use icu::properties::sets::CodePointSetDataBorrowed;
pub use icu::properties::sets;
/// Debug-printing helpers for code point sets.
pub trait CodePointSetDataExt {
    /// Prints the contents of the set for debugging.
    fn debug_print(&self);

    /// Like [`debug_print`](Self::debug_print), but takes a `base` character to print
    /// together with each entry.
    fn debug_print_based(&self, base: char);
}
/// Both methods delegate to `debug_print_impl`; they differ only in whether a base
/// character is supplied.
impl<'a> CodePointSetDataExt for CodePointSetDataBorrowed<'a> {
    fn debug_print(&self) {
        debug_print_impl(self, None);
    }

    fn debug_print_based(&self, base: char) {
        debug_print_impl(self, Some(base));
    }
}
/// Prints every range of `set` to stdout, one line per range.
///
/// Each line starts with the range bounds in hexadecimal (`start..=end`), followed by
/// every code point of the range rendered as a character. When `base` is given, it is
/// printed directly in front of each character.
///
/// Code points that are not valid `char` values (surrogates) are rendered as U+FFFD
/// instead of aborting the whole dump with a panic.
fn debug_print_impl(set: &CodePointSetDataBorrowed, base: Option<char>) {
    for range in set.iter_ranges() {
        print!("{:#x}..={:#x}", range.start(), range.end());
        for codepoint in range {
            // `char::from_u32` returns `None` for surrogate code points.
            let ch = char::from_u32(codepoint).unwrap_or(char::REPLACEMENT_CHARACTER);
            match base {
                Some(base) => print!(" {}{}", base, ch),
                None => print!(" {}", ch),
            }
        }
        println!();
    }
}

View File

@@ -1,63 +0,0 @@
use icu_segmenter::{GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter};
use itertools::Itertools;
/// Owns one instance each of the ICU sentence, word and grapheme-cluster segmenters.
pub struct IcuSegmenterCache {
    /// Segmenter used for sentence boundaries.
    sentence_segmenter: SentenceSegmenter,
    /// Segmenter used for word boundaries.
    word_segmenter: WordSegmenter,
    /// Segmenter used for grapheme-cluster boundaries.
    grapheme_cluster_segmenter: GraphemeClusterSegmenter,
}
impl IcuSegmenterCache {
    /// Builds the cache from `SentenceSegmenter::new()`, `WordSegmenter::new_auto()` and
    /// `GraphemeClusterSegmenter::new()`.
    pub fn new_auto() -> Self {
        Self {
            sentence_segmenter: SentenceSegmenter::new(),
            word_segmenter: WordSegmenter::new_auto(),
            grapheme_cluster_segmenter: GraphemeClusterSegmenter::new(),
        }
    }

    /// Splits `text` into sentences using the cached sentence segmenter.
    pub fn split_sentences<'t>(&self, text: &'t str) -> Vec<&'t str> {
        split_sentences(text, &self.sentence_segmenter)
    }

    /// Splits `text` into words using the cached word segmenter.
    pub fn split_words<'t>(&self, text: &'t str) -> Vec<&'t str> {
        split_words(text, &self.word_segmenter)
    }

    /// Splits `text` into grapheme clusters using the cached grapheme-cluster segmenter.
    pub fn split_grapheme_clusters<'t>(&self, text: &'t str) -> Vec<&'t str> {
        split_grapheme_clusters(text, &self.grapheme_cluster_segmenter)
    }
}
/// Splits `text` at the boundaries reported by `segmenter` and returns the trimmed,
/// non-empty pieces as slices borrowed from `text`.
pub fn split_sentences<'t>(text: &'t str, segmenter: &SentenceSegmenter) -> Vec<&'t str> {
    let mut sentences = Vec::new();
    // Each pair of neighbouring boundaries delimits one candidate sentence.
    for (start, end) in segmenter.segment_str(text).tuple_windows() {
        let sentence = text[start..end].trim();
        if !sentence.is_empty() {
            sentences.push(sentence);
        }
    }
    sentences
}
/// Splits `text` at the boundaries reported by `segmenter` and returns only the segments
/// whose closing boundary is classified as word-like, as slices borrowed from `text`.
pub fn split_words<'t>(text: &'t str, segmenter: &WordSegmenter) -> Vec<&'t str> {
    let boundaries = segmenter.segment_str(text).iter_with_word_type();

    let mut words = Vec::new();
    // The word type attached to the closing boundary describes the segment before it.
    for ((start, _), (end, word_type)) in boundaries.tuple_windows() {
        if word_type.is_word_like() {
            words.push(&text[start..end]);
        }
    }
    words
}
/// Splits `text` at the grapheme-cluster boundaries reported by `segmenter` and returns
/// every segment as a slice borrowed from `text`.
pub fn split_grapheme_clusters<'t>(
    text: &'t str,
    segmenter: &GraphemeClusterSegmenter,
) -> Vec<&'t str> {
    segmenter
        .segment_str(text)
        .tuple_windows()
        .map(|(start, end)| &text[start..end])
        .collect()
}

View File

@@ -1,961 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "calendrical_calculations"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cec493b209a1b81fa32312d7ceca1b547d341c7b5f16a3edbf32b1d8b455bbdf"
dependencies = [
"core_maths",
"displaydoc",
]
[[package]]
name = "cc"
version = "1.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b9470d453346108f93a59222a9a1a5724db32d0a4727b7ab7ace4b4d822dc9"
dependencies = [
"shlex",
]
[[package]]
name = "core_maths"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3"
dependencies = [
"libm",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "fixed_decimal"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0febbeb1118a9ecdee6e4520ead6b54882e843dd0592ad233247dbee84c53db8"
dependencies = [
"displaydoc",
"smallvec",
"writeable",
]
[[package]]
name = "flest"
version = "0.1.0"
dependencies = [
"textutils",
]
[[package]]
name = "flesttools"
version = "0.1.0"
dependencies = [
"flest",
"pancurses",
"serde",
"serde_json",
"textutils",
]
[[package]]
name = "icu"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff5e3018d703f168b00dcefa540a65f1bbc50754ae32f3f5f0e43fe5ee51502"
dependencies = [
"icu_calendar",
"icu_casemap",
"icu_collator",
"icu_collections",
"icu_datetime",
"icu_decimal",
"icu_experimental",
"icu_list",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_plurals",
"icu_properties",
"icu_provider",
"icu_segmenter",
"icu_timezone",
]
[[package]]
name = "icu_calendar"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7265b2137f9a36f7634a308d91f984574bbdba8cfd95ceffe1c345552275a8ff"
dependencies = [
"calendrical_calculations",
"displaydoc",
"icu_calendar_data",
"icu_locid",
"icu_locid_transform",
"icu_provider",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_calendar_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e009b7f0151ee6fb28c40b1283594397e0b7183820793e9ace3dcd13db126d0"
[[package]]
name = "icu_casemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff0c8ae9f8d31b12e27fc385ff9ab1f3cd9b17417c665c49e4ec958c37da75f"
dependencies = [
"displaydoc",
"icu_casemap_data",
"icu_collections",
"icu_locid",
"icu_properties",
"icu_provider",
"writeable",
"zerovec",
]
[[package]]
name = "icu_casemap_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d57966d5ab748f74513be4046867f9a20e801e2775d41f91d04a0f560b61f08"
[[package]]
name = "icu_collator"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d370371887d31d56f361c3eaa15743e54f13bc677059c9191c77e099ed6966b2"
dependencies = [
"displaydoc",
"icu_collator_data",
"icu_collections",
"icu_locid_transform",
"icu_normalizer",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_collator_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ee3f88741364b7d6269cce6827a3e6a8a2cf408a78f766c9224ab479d5e4ae5"
[[package]]
name = "icu_collections"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_datetime"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d115efb85e08df3fd77e77f52e7e087545a783fffba8be80bfa2102f306b1780"
dependencies = [
"displaydoc",
"either",
"fixed_decimal",
"icu_calendar",
"icu_datetime_data",
"icu_decimal",
"icu_locid",
"icu_locid_transform",
"icu_plurals",
"icu_provider",
"icu_timezone",
"smallvec",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_datetime_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ba7e7f7a01269b9afb0a39eff4f8676f693b55f509b3120e43a0350a9f88bea"
[[package]]
name = "icu_decimal"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb8fd98f86ec0448d85e1edf8884e4e318bb2e121bd733ec929a05c0a5e8b0eb"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_decimal_data",
"icu_locid_transform",
"icu_provider",
"writeable",
]
[[package]]
name = "icu_decimal_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d424c994071c6f5644f999925fc868c85fec82295326e75ad5017bc94b41523"
[[package]]
name = "icu_experimental"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "844ad7b682a165c758065d694bc4d74ac67f176da1c499a04d85d492c0f193b7"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_collections",
"icu_decimal",
"icu_experimental_data",
"icu_locid",
"icu_locid_transform",
"icu_normalizer",
"icu_pattern",
"icu_plurals",
"icu_properties",
"icu_provider",
"litemap",
"num-bigint",
"num-rational",
"num-traits",
"smallvec",
"tinystr",
"writeable",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_experimental_data"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c178b9a34083fca5bd70d61f647575335e9c197d0f30c38e8ccd187babc69d0"
[[package]]
name = "icu_list"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbfeda1d7775b6548edd4e8b7562304a559a91ed56ab56e18961a053f367c365"
dependencies = [
"displaydoc",
"icu_list_data",
"icu_locid_transform",
"icu_provider",
"regex-automata 0.2.0",
"writeable",
]
[[package]]
name = "icu_list_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1825170d2c6679cb20dbd96a589d034e49f698aed9a2ef4fafc9a0101ed298f"
[[package]]
name = "icu_locid"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_pattern"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7f36aafd098d6717de34e668a8120822275c1fba22b936e757b7de8a2fd7e4"
dependencies = [
"displaydoc",
"either",
"writeable",
"yoke",
"zerofrom",
]
[[package]]
name = "icu_plurals"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba5a70e7c025dbd5c501b0a5c188cd11666a424f0dadcd4f0a95b7dafde3b114"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_locid_transform",
"icu_plurals_data",
"icu_provider",
"zerovec",
]
[[package]]
name = "icu_plurals_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3e8f775b215d45838814a090a2227247a7431d74e9156407d9c37f6ef0f208"
[[package]]
name = "icu_properties"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "icu_segmenter"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de"
dependencies = [
"core_maths",
"displaydoc",
"icu_collections",
"icu_locid",
"icu_provider",
"icu_segmenter_data",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_segmenter_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df"
[[package]]
name = "icu_timezone"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa91ba6a585939a020c787235daa8aee856d9bceebd6355e283c0c310bc6de96"
dependencies = [
"displaydoc",
"icu_calendar",
"icu_provider",
"icu_timezone_data",
"tinystr",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_timezone_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c588878c508a3e2ace333b3c50296053e6483c6a7541251b546cc59dcd6ced8e"
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.161"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linkify"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
dependencies = [
"memchr",
]
[[package]]
name = "litemap"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "ncurses"
version = "5.101.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e2c5d34d72657dc4b638a1c25d40aae81e4f1c699062f72f467237920752032"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "num-bigint"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "pancurses"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0352975c36cbacb9ee99bfb709b9db818bed43af57751797f8633649759d13db"
dependencies = [
"libc",
"log",
"ncurses",
"pdcurses-sys",
"winreg",
]
[[package]]
name = "pdcurses-sys"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "084dd22796ff60f1225d4eb6329f33afaf4c85419d51d440ab6b8c6f4529166b"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "pkg-config"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
[[package]]
name = "proc-macro2"
version = "1.0.88"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.4.8",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"
dependencies = [
"memchr",
]
[[package]]
name = "regex-automata"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "serde"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.210"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.132"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "syn"
version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "textutils"
version = "0.1.0"
dependencies = [
"icu",
"icu_segmenter",
"itertools",
"lazy_static",
"linkify",
"once_cell",
"regex",
"unicase",
"unicode-normalization",
]
[[package]]
name = "tinystr"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "tinyvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "unicase"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df"
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-normalization"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
dependencies = [
"tinyvec",
]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "winreg"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27a759395c1195c4cc5cda607ef6f8f6498f64e78f7900f5de0a127a424704a"
dependencies = [
"winapi",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
dependencies = [
"either",
]
[[package]]
name = "yoke"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb594dd55d87335c5f60177cee24f19457a5ec10a065e0a3014722ad252d0a1f"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View File

@@ -1,11 +0,0 @@
# Cargo manifest for the `flesttools` crate.
[package]
name = "flesttools"
version = "0.1.0"
edition = "2021"
[dependencies]
# Local crates, referenced by relative path.
flest = { path = "../../libnative/flest" }
textutils = { path = "../../libnative/textutils" }
# Curses bindings, built with the crate's "wide" feature enabled.
pancurses = { version = "0.17.0", features = ["wide"] }
serde = "1.0.203"
serde_json = "1.0.120"

View File

@@ -1,143 +0,0 @@
use flest::model::NgramModel;
use pancurses::Input;
use textutils::filter::preprocess_auto;
use textutils::segment::IcuSegmenterCache;
use std::env;
use std::fs;
use std::io::BufRead;
use std::io::BufReader;
// Marker token that delimits sentences in the token stream. `tokenize_text` emits it
// before the first sentence and after every sentence, and `main` stores it as the
// model's `meta.sentence_token`. The literal value is a backslash followed by `sep`.
const TOKEN_SENTENCE_SEPARATOR: &str = "\\sep";
// Splits `text` into sentences and every sentence into words, returning one flat token
// list. `TOKEN_SENTENCE_SEPARATOR` is placed before the first sentence and after each
// sentence, so the result always starts with a separator.
fn tokenize_text(text: &str) -> Vec<&str> {
    let segmenter_cache = IcuSegmenterCache::new_auto();
    // Leading separator is emitted even when no sentence is found.
    let mut token_stream: Vec<&str> = vec![TOKEN_SENTENCE_SEPARATOR];
    for sentence in segmenter_cache.split_sentences(text) {
        token_stream.extend(segmenter_cache.split_words(sentence));
        token_stream.push(TOKEN_SENTENCE_SEPARATOR);
    }
    token_stream
}
// Preprocesses `text`, trims surrounding whitespace and, unless nothing is left,
// tokenizes the remainder and feeds the tokens to `model`.
fn train_model(text: &str, model: &mut NgramModel) {
    let preprocessed = preprocess_auto(text);
    let trimmed = preprocessed.trim();
    // Nothing to learn from input that is empty after preprocessing and trimming.
    if !trimmed.is_empty() {
        let tokens = tokenize_text(&trimmed);
        model.train_from_tokens(&tokens);
    }
}
// Loads the entire file at `path` into memory as a string and trains `model` on it.
// Panics with "Failed to read file" when the file cannot be read.
fn train_from_plain_text(path: &str, model: &mut NgramModel) {
    let contents = fs::read_to_string(path).expect("Failed to read file");
    train_model(contents.as_str(), model);
}
// Trains `model` on the "body" field of line-delimited JSON records read from `path`.
//
// Records whose "author" is "AutoModerator" are skipped and do not count towards the
// line limit. Lines that cannot be read are skipped but still counted. Reading stops
// once the counter exceeds 10000.
//
// Panics if the file cannot be opened or a readable line is not valid JSON.
fn train_from_reddit_comments(path: &str, model: &mut NgramModel) {
    let reader = BufReader::new(fs::File::open(path).expect("Failed to open file"));
    let mut counted_lines = 0;
    for entry in reader.lines() {
        if let Ok(raw) = entry {
            let record: serde_json::Value =
                serde_json::from_str(&raw).expect("Failed to parse JSON");
            let author = record.get("author").and_then(|it| it.as_str());
            if author == Some("AutoModerator") {
                // Jumps straight to the next line, bypassing the counter below.
                continue;
            }
            let body = record.get("body").and_then(|it| it.as_str());
            if let Some(comment_text) = body {
                train_model(comment_text, model);
            }
        }
        counted_lines += 1;
        if counted_lines > 10000 {
            break;
        }
    }
}
// Entry point of the n-gram model debug frontend.
//
// Expects exactly one command-line argument: the path of the training file. Paths ending
// in ".reddit.jsonl" are loaded with `train_from_reddit_comments`, everything else with
// `train_from_plain_text`. After training, a curses screen shows the model's predictions
// for the text typed so far:
//   - F10 leaves the loop and restores the terminal,
//   - Backspace removes the last character of the input,
//   - Enter trains the model on the current input (the input is kept),
//   - any other character is appended to the input.
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        eprintln!("Usage: {} <file_path>", args[0]);
        return;
    }
    let path = &args[1];
    let mut model = NgramModel::new();
    model.meta.sentence_token = TOKEN_SENTENCE_SEPARATOR.to_owned();
    if path.ends_with(".reddit.jsonl") {
        train_from_reddit_comments(path, &mut model);
    } else {
        train_from_plain_text(path, &mut model);
    }
    //model.trie_root.debug_pretty_print();
    //return;

    let window = pancurses::initscr();
    let mut input_text = String::new();
    pancurses::noecho();
    window.keypad(true);
    loop {
        // The demo tokenizer is a plain whitespace split, prefixed with the sentence
        // separator so the model sees the input as the start of a sentence.
        let mut words: Vec<&str> = input_text.split_whitespace().collect();
        words.insert(0, TOKEN_SENTENCE_SEPARATOR);
        // An empty trailing token is appended when the input ends with a space or no
        // word has been typed yet.
        // NOTE(review): presumably this asks the model for the next word instead of a
        // completion of the last one; confirm against `NgramModel::predict`.
        if input_text.ends_with(' ') || words.last() == Some(&TOKEN_SENTENCE_SEPARATOR) {
            words.push("");
        }
        let predictions = model.predict(&words);

        window.clear();
        window.addstr("N-gram model debug frontend\n");
        window.addstr(" demo tokenizer only supports single-line sentence in input text!\n\n");
        window.addstr(format!("enter text: {}\n", input_text));
        window.addstr(format!("detected words: {:?}\n\n", words));
        window.addstr("predictions:\n");
        for (i, candidate) in predictions.iter().enumerate() {
            // Only the top candidate is shown in bold, and only above 90% confidence.
            let highlight = i == 0 && candidate.confidence > (0.9 * 255.0) as u8;
            if highlight {
                window.attron(pancurses::A_BOLD);
            }
            window.addstr(format!(" {}. {} (c={:.2})\n", i + 1, candidate.text, candidate.confidence));
            if highlight {
                window.attroff(pancurses::A_BOLD);
            }
        }
        if predictions.is_empty() {
            window.addstr(" (none)\n");
        }

        // Put the cursor right after the echoed input: row 3 is the "enter text: " line
        // and the prompt itself is 12 columns wide. The offset is counted in characters,
        // not UTF-8 bytes (`String::len`), so non-ASCII input no longer pushes the
        // cursor too far right. This is still an approximation for double-width or
        // combining characters.
        let cursor_col = 12 + input_text.chars().count() as i32;
        window.mv(3, cursor_col);
        window.refresh();

        match window.getch().unwrap() {
            Input::KeyF10 => break,
            Input::KeyBackspace => {
                input_text.pop();
            }
            // Must stay ahead of the generic `Character` arm so Enter is not appended.
            Input::Character('\n') => train_model(&input_text, &mut model),
            Input::Character(ch) => input_text.push(ch),
            _ => {}
        }
    }
    pancurses::endwin();
}

View File

@@ -1,27 +0,0 @@
#!/bin/bash
# Regenerates .vscode/settings.json in the workspace root (the parent directory of the
# directory containing this script) so that "rust-analyzer.linkedProjects" lists every
# Cargo.toml found below the workspace root.
#
# The script must be started from the workspace root and overwrites any existing
# settings.json.
# NOTE(review): the generated file contains trailing commas, so it is not strict JSON;
# presumably VS Code tolerates this for settings files - confirm.
WORKSPACE_ROOT_DIR="$(realpath "$(dirname "$0")/..")"
VSCODE_DIR="$WORKSPACE_ROOT_DIR/.vscode"
VSCODE_SETTINGS_JSON_PATH="$VSCODE_DIR/settings.json"

if [ "$WORKSPACE_ROOT_DIR" != "$(pwd)" ]; then
    echo "Not executing this script from workspace root dir!"
    exit 1
fi

mkdir -p "$VSCODE_DIR"

{
    printf '{\n'
    # <rust-analyzer>
    printf ' "rust-analyzer.linkedProjects": [\n'
    # NUL-delimited iteration keeps paths containing spaces or glob characters intact;
    # printf '%s' writes them verbatim instead of interpreting backslash escapes.
    while IFS= read -r -d '' rust_project_path; do
        printf ' "%s",\n' "$rust_project_path"
    done < <(find "$WORKSPACE_ROOT_DIR" -type f -name "Cargo.toml" -print0)
    printf ' ],\n'
    # </rust-analyzer>
    printf '}\n'
} > "$VSCODE_SETTINGS_JSON_PATH"