Compare commits
3 Commits
flest
...
feat/flest
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f41385ae75 | ||
|
|
5c85be61d9 | ||
|
|
639beb9e64 |
25
libnative/flest/Cargo.lock
generated
Normal file
25
libnative/flest/Cargo.lock
generated
Normal file
@@ -0,0 +1,25 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "flest"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"fxhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
9
libnative/flest/Cargo.toml
Normal file
9
libnative/flest/Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "flest"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
fxhash = "0.2.1"
|
||||
102
libnative/flest/src/dyntrie.rs
Normal file
102
libnative/flest/src/dyntrie.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
use fxhash::FxHashMap;
|
||||
|
||||
#[derive(Default)]
|
||||
struct DynTrieNode<V> where V: Default {
|
||||
children: FxHashMap<char, Box<DynTrieNode<V>>>,
|
||||
value: Option<V>,
|
||||
}
|
||||
|
||||
impl<V> DynTrieNode<V> where V: Default {
|
||||
fn for_each_recursive<'a, F>(&'a self, current_word: &mut Vec<char>, f: &mut F)
|
||||
where F: FnMut(&[char], &'a V) {
|
||||
if let Some(value) = &self.value {
|
||||
f(¤t_word, value);
|
||||
}
|
||||
for (letter, node) in &self.children {
|
||||
current_word.push(*letter);
|
||||
node.for_each_recursive(current_word, f);
|
||||
current_word.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct DynTrie<V> where V: Default {
|
||||
root: DynTrieNode<V>,
|
||||
}
|
||||
|
||||
impl<V> DynTrie<V>
|
||||
where V: Default {
|
||||
pub fn find(&self, word: &[char]) -> Option<&V> {
|
||||
let mut current_node = &self.root;
|
||||
for letter in word {
|
||||
match current_node.children.get(letter) {
|
||||
Some(node) => current_node = node,
|
||||
None => return None,
|
||||
}
|
||||
}
|
||||
return current_node.value.as_ref();
|
||||
}
|
||||
|
||||
fn str_fuzzy_match_whole(str1: &[char], str2: &[char]) -> f64 {
|
||||
let len1 = str1.len();
|
||||
let len2 = str2.len();
|
||||
let max_len = std::cmp::max(len1, len2);
|
||||
let mut score: f64 = 0.0;
|
||||
let mut penalty: f64 = 0.0;
|
||||
for i in 0..max_len {
|
||||
let ch1 = str1.get(i).unwrap_or(&' ');
|
||||
let ch2 = str2.get(i).unwrap_or(&' ');
|
||||
if ch1 == ch2 {
|
||||
score += 1.0;
|
||||
} else if ch1.to_lowercase().eq(ch2.to_lowercase()) {
|
||||
score += 0.5;
|
||||
} else {
|
||||
penalty += if i == 0 { 2.0 } else { 1.0 };
|
||||
}
|
||||
}
|
||||
return f64::max(0.0, score - penalty)
|
||||
}
|
||||
|
||||
// TODO: optimization: we do not need to iterate over all
|
||||
// the trie, we can predict if the score will never be >= 0
|
||||
// and skip the whole subtree
|
||||
pub fn find_many(&self, word: &[char]) -> Vec<(Vec<char>, &V)> {
|
||||
let mut results = Vec::new();
|
||||
self.for_each(&mut |current_word, value| {
|
||||
let score = Self::str_fuzzy_match_whole(word, current_word);
|
||||
if score > 0.0 {
|
||||
results.push((current_word.to_owned(), value));
|
||||
}
|
||||
});
|
||||
return results;
|
||||
}
|
||||
|
||||
pub fn find_or_insert(&mut self, word: &[char], value: V) -> &mut V {
|
||||
let mut current_node = &mut self.root;
|
||||
for letter in word {
|
||||
current_node = current_node.children.entry(*letter)
|
||||
.or_insert_with(|| Box::new(DynTrieNode::default()));
|
||||
}
|
||||
if current_node.value.is_none() {
|
||||
current_node.value = Some(value);
|
||||
}
|
||||
return current_node.value.as_mut().unwrap();
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn insert(&mut self, word: &[char], value: V) {
|
||||
let mut current_node = &mut self.root;
|
||||
for letter in word {
|
||||
current_node = current_node.children.entry(*letter)
|
||||
.or_insert_with(|| Box::new(DynTrieNode::default()));
|
||||
}
|
||||
current_node.value = Some(value);
|
||||
}
|
||||
|
||||
pub fn for_each<'a, F>(&'a self, f: &mut F)
|
||||
where F: FnMut(&[char], &'a V) {
|
||||
let mut current_word: Vec<char> = Vec::new();
|
||||
self.root.for_each_recursive(&mut current_word, f);
|
||||
}
|
||||
}
|
||||
4
libnative/flest/src/lib.rs
Normal file
4
libnative/flest/src/lib.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod dyntrie;
|
||||
mod ngrammodel;
|
||||
|
||||
pub use ngrammodel::*;
|
||||
212
libnative/flest/src/ngrammodel.rs
Normal file
212
libnative/flest/src/ngrammodel.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::dyntrie::DynTrie;
|
||||
|
||||
#[derive(Default)]
|
||||
struct NgramModelNode {
|
||||
children: DynTrie<Box<NgramModelNode>>,
|
||||
time: u64,
|
||||
usage: u64,
|
||||
}
|
||||
|
||||
impl NgramModelNode {
|
||||
fn find(&self, ngram: &[&str]) -> Option<&NgramModelNode> {
|
||||
if ngram.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let token: Vec<char> = ngram[0].chars().collect();
|
||||
let child = self.children.find(&token);
|
||||
if child.is_none() {
|
||||
return None;
|
||||
}
|
||||
let child = child.unwrap();
|
||||
if ngram.len() == 1 {
|
||||
return Some(child);
|
||||
}
|
||||
return child.find(&ngram[1..]);
|
||||
}
|
||||
|
||||
fn find_many(&self, ngram: &[&str]) -> Vec<(Vec<char>, &NgramModelNode)> {
|
||||
if ngram.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
let token: Vec<char> = ngram[0].chars().collect();
|
||||
let ret = self.children.find_many(&token);
|
||||
if ngram.len() == 1 {
|
||||
return ret
|
||||
.into_iter()
|
||||
.map(|node| (node.0, node.1.as_ref()))
|
||||
.collect();
|
||||
}
|
||||
let mut ret2 = Vec::new();
|
||||
for (_, child) in &ret {
|
||||
ret2.extend(child.find_many(&ngram[1..]));
|
||||
}
|
||||
return ret2;
|
||||
}
|
||||
|
||||
fn train(&mut self, ngram: &[&str], current_time: u64) {
|
||||
if ngram.is_empty() {
|
||||
panic!("ngram must not be empty");
|
||||
}
|
||||
let token: Vec<char> = ngram[0].chars().collect();
|
||||
let child = self.children.find_or_insert(&token, Box::new(NgramModelNode::default()));
|
||||
if ngram.len() == 1 {
|
||||
if current_time != 0 {
|
||||
child.time = current_time;
|
||||
}
|
||||
child.usage += 1;
|
||||
} else {
|
||||
child.train(&ngram[1..], current_time);
|
||||
}
|
||||
}
|
||||
|
||||
fn debug_print(&self, _indent: usize) {
|
||||
// println!("{}{}{}", " ".repeat(indent), self.token, if self.time > 0 { "*" } else { "" });
|
||||
// for child in &self.children {
|
||||
// child.debug_print(indent + 1);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct NgramModel {
|
||||
root: NgramModelNode,
|
||||
time: u64,
|
||||
}
|
||||
|
||||
impl NgramModel {
|
||||
#[allow(dead_code)]
|
||||
fn find(&self, ngram: &[&str]) -> Option<&NgramModelNode> {
|
||||
self.root.find(ngram)
|
||||
}
|
||||
|
||||
fn find_many(&self, ngram: &[&str]) -> Vec<(Vec<char>, &NgramModelNode)> {
|
||||
self.root.find_many(ngram)
|
||||
}
|
||||
|
||||
pub fn train_dataset(&mut self, token_list: &[&str]) {
|
||||
self.root.train(token_list, 0);
|
||||
}
|
||||
|
||||
pub fn train_input(&mut self, token_list: &[&str]) {
|
||||
self.time += 1;
|
||||
self.root.train(token_list, self.time);
|
||||
}
|
||||
|
||||
pub fn debug_print(&self) {
|
||||
self.root.debug_print(0);
|
||||
}
|
||||
|
||||
pub fn predict(&self, history: &Vec<&str>) -> Vec<(String, f64)> {
|
||||
let mut tmin = u64::MAX;
|
||||
let mut tmax = u64::MIN;
|
||||
let mut umin = u64::MAX;
|
||||
let mut umax = u64::MIN;
|
||||
let nmin = 1;
|
||||
let nmax = 3;
|
||||
let mut candidate_nodes: Vec<(Vec<char>, &NgramModelNode, f64)> = Vec::new();
|
||||
|
||||
let user_input_word = history.last().unwrap_or(&"");
|
||||
|
||||
for n in nmin..=std::cmp::min(history.len(), nmax) {
|
||||
let nweight = 1.0 - (nmax - n) as f64 * 0.1;
|
||||
let ngram = &history[history.len() - n..history.len() - 1];
|
||||
let nodes = self.find_many(ngram);
|
||||
for (_, node) in nodes {
|
||||
node.children.for_each(&mut |curr_word, child| {
|
||||
candidate_nodes.push((curr_word.to_owned(), child, nweight));
|
||||
tmin = tmin.min(child.time);
|
||||
tmax = tmax.max(child.time);
|
||||
umin = umin.min(child.usage);
|
||||
umax = umax.max(child.usage);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
candidate_nodes = candidate_nodes
|
||||
.into_iter()
|
||||
.map(|(word, node, nweight)| {
|
||||
(
|
||||
word,
|
||||
node,
|
||||
nweight
|
||||
* norm_weight(node.time, tmin, tmax)
|
||||
* norm_weight(node.usage, umin, umax),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !user_input_word.is_empty() {
|
||||
let user_input_word: Vec<char> = user_input_word.chars().collect();
|
||||
let mut filtered_nodes = Vec::new();
|
||||
for (word, node, weight) in candidate_nodes {
|
||||
let score_len = std::cmp::min(
|
||||
(word.len() + user_input_word.len()) / 2,
|
||||
user_input_word.len(),
|
||||
) as f64;
|
||||
let score = str_fuzzy_match_live(&word, &user_input_word);
|
||||
if score > 0.0 {
|
||||
let new_weight = 0.95 * (score / score_len) + 0.05 * weight;
|
||||
filtered_nodes.push((word, node, new_weight));
|
||||
}
|
||||
}
|
||||
self.root.children.for_each(&mut |word, node| {
|
||||
let score_len = std::cmp::min(
|
||||
(word.len() + user_input_word.len()) / 2,
|
||||
user_input_word.len(),
|
||||
) as f64;
|
||||
let score = str_fuzzy_match_live(&word, &user_input_word);
|
||||
if score > 0.0 {
|
||||
let new_weight = 0.75 * (score / score_len) + 0.25 * 0.0;
|
||||
filtered_nodes.push((word.to_owned(), node, new_weight));
|
||||
}
|
||||
});
|
||||
candidate_nodes = filtered_nodes;
|
||||
}
|
||||
|
||||
candidate_nodes.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
|
||||
|
||||
let mut predictions: HashMap<String, f64> = HashMap::new();
|
||||
for (word, _, weight) in candidate_nodes {
|
||||
predictions
|
||||
.entry(word.iter().collect())
|
||||
.or_insert(weight);
|
||||
}
|
||||
|
||||
let mut predictions_vec: Vec<(String, f64)> = predictions.into_iter().collect();
|
||||
predictions_vec.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
|
||||
predictions_vec.into_iter().take(8).collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps `x` from the range `[xmin, xmax]` onto a weight in `[0.0, 1.0]`.
///
/// Values at or below `xmin` give 0.0 and values at or above `xmax` give 1.0.
/// In between, the linear position `t` is shaped by `2t - t^2`.
///
/// When the range is degenerate (`xmax <= xmin`) nothing distinguishes the
/// candidates, so the neutral factor 1.0 is returned. Returning 0.0 there would
/// zero every product this weight is multiplied into.
fn norm_weight(x: u64, xmin: u64, xmax: u64) -> f64 {
    if xmax <= xmin {
        return 1.0;
    }
    if x <= xmin {
        return 0.0;
    }
    if x >= xmax {
        return 1.0;
    }
    let xnorm = (x - xmin) as f64 / (xmax - xmin) as f64;
    2.0 * xnorm - xnorm.powi(2)
}
|
||||
|
||||
/// Scores how well the characters typed so far (`current_word`) match the start
/// of `word`.
///
/// Only the first `current_word.len()` positions are compared; `word` is padded
/// with spaces if it is shorter. An identical character adds 1.0, a character
/// that matches only after lowercasing adds 0.9, and any other pair adds a
/// penalty of 1.0 (2.0 at the first position). The result is
/// `score - 0.125 * penalty^2`, floored at 0.0.
fn str_fuzzy_match_live(word: &[char], current_word: &[char]) -> f64 {
    let mut hits = 0.0_f64;
    let mut misses = 0.0_f64;
    for (pos, &typed) in current_word.iter().enumerate() {
        let expected = word.get(pos).copied().unwrap_or(' ');
        if expected == typed {
            hits += 1.0;
        } else if expected.to_lowercase().eq(typed.to_lowercase()) {
            hits += 0.9;
        } else if pos == 0 {
            misses += 2.0;
        } else {
            misses += 1.0;
        }
    }
    f64::max(0.0, hits - 0.125 * misses.powi(2))
}
|
||||
354
libnative/textutils/Cargo.lock
generated
Normal file
354
libnative/textutils/Cargo.lock
generated
Normal file
@@ -0,0 +1,354 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core_maths"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3"
|
||||
dependencies = [
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_provider_macros",
|
||||
"stable_deref_trait",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider_macros"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_segmenter"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de"
|
||||
dependencies = [
|
||||
"core_maths",
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_locid",
|
||||
"icu_provider",
|
||||
"icu_segmenter_data",
|
||||
"utf8_iter",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_segmenter_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linkify"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.88"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textutils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"icu_segmenter",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"linkify",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
|
||||
dependencies = [
|
||||
"zerofrom-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom-derive"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
13
libnative/textutils/Cargo.toml
Normal file
13
libnative/textutils/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "textutils"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
icu_segmenter = "1.5.0"
|
||||
itertools = "0.13.0"
|
||||
lazy_static = "1.5.0"
|
||||
linkify = "0.10.0"
|
||||
regex = "1.10.5"
|
||||
20
libnative/textutils/src/filter.rs
Normal file
20
libnative/textutils/src/filter.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use lazy_static::lazy_static;
|
||||
use linkify::{self, LinkFinder};
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
    /// Shared link finder with the default `LinkFinder::new()` configuration.
    static ref LINK_FINDER: LinkFinder = LinkFinder::new();

    /// Matches an optional leading `/` followed by either `r/` plus 3 to 21
    /// characters of `[a-zA-Z0-9_]`, or `u/` plus 3 to 20 characters of
    /// `[a-zA-Z0-9_-]`.
    static ref REDDIT_REGEX: Regex = Regex::new(
        r"\/?(r\/[a-zA-Z0-9_]{3}[a-zA-Z0-9_]{0,18}|u\/[a-zA-Z0-9_-]{3}[a-zA-Z0-9_-]{0,17})"
    )
    .unwrap();
}
|
||||
|
||||
pub fn preprocess_auto(text: &str) -> String {
|
||||
let mut cleaned_text = String::new();
|
||||
let mut begin_cleaned_index = 0;
|
||||
for span in LINK_FINDER.links(text) {
|
||||
cleaned_text.push_str(&text[begin_cleaned_index..span.start()]);
|
||||
begin_cleaned_index = span.end();
|
||||
}
|
||||
cleaned_text.push_str(&text[begin_cleaned_index..]);
|
||||
cleaned_text = REDDIT_REGEX.replace_all(&cleaned_text, "").to_string();
|
||||
return cleaned_text;
|
||||
}
|
||||
52
libnative/textutils/src/lib.rs
Normal file
52
libnative/textutils/src/lib.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
mod filter;
|
||||
mod segment;
|
||||
|
||||
pub use filter::*;
|
||||
pub use segment::*;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use icu_segmenter::{SentenceSegmenter, WordSegmenter};

    use super::*;

    /// Sentences are split and trimmed.
    #[test]
    fn segment_sentences_simple() {
        let text = "Hello, world! How are you? I'm fine.";
        let segmenter = SentenceSegmenter::new();
        let sentences = split_sentences(text, &segmenter);
        assert_eq!(&sentences, &["Hello, world!", "How are you?", "I'm fine."]);
    }

    /// Only word-like segments are returned; punctuation and spaces are dropped.
    #[test]
    fn segment_words_simple() {
        let text = "Hello, world! How are you? I'm fine.";
        let segmenter = WordSegmenter::new_auto();
        let words = split_words(text, &segmenter);
        assert_eq!(&words, &["Hello", "world", "How", "are", "you", "I'm", "fine"]);
    }

    /// A bare URL is removed; the spaces on both sides of it stay, hence the
    /// double space in the expected text.
    #[test]
    fn preprocess_auto_simple() {
        let text = "Hello, world! How are you? I'm fine. https://example.com and more";
        let cleaned_text = preprocess_auto(text);
        assert_eq!(&cleaned_text, "Hello, world! How are you? I'm fine.  and more");
    }

    /// `r/...` and `u/...` identifiers are removed; the spaces around `u/example`
    /// stay, hence the double space in the expected text.
    #[test]
    fn preprocess_reddit_ids() {
        let text = "have a look at r/cats, user u/example posed a cute cat in there";
        let cleaned_text = preprocess_auto(text);
        assert_eq!(&cleaned_text, "have a look at , user  posed a cute cat in there");
    }

    /// The URL inside a markdown link is removed while the link text survives and
    /// is still segmented into words.
    #[test]
    fn preprocess_url_markdown() {
        let text = "You can find an example [in the documentation](https://example.com) or on GitHub";
        let cleaned_text = preprocess_auto(text);
        assert_eq!(&cleaned_text, "You can find an example [in the documentation]() or on GitHub");
        let segmenter = WordSegmenter::new_auto();
        let words = split_words(&cleaned_text, &segmenter);
        assert_eq!(&words, &["You", "can", "find", "an", "example", "in", "the", "documentation", "or", "on", "GitHub"]);
    }
}
|
||||
63
libnative/textutils/src/segment.rs
Normal file
63
libnative/textutils/src/segment.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use icu_segmenter::{GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter};
|
||||
use itertools::Itertools;
|
||||
|
||||
pub struct IcuSegmenterCache {
|
||||
sentence_segmenter: SentenceSegmenter,
|
||||
word_segmenter: WordSegmenter,
|
||||
grapheme_cluster_segmenter: GraphemeClusterSegmenter,
|
||||
}
|
||||
|
||||
impl IcuSegmenterCache {
|
||||
pub fn new_auto() -> Self {
|
||||
let sentence_segmenter = SentenceSegmenter::new();
|
||||
let word_segmenter = WordSegmenter::new_auto();
|
||||
let grapheme_cluster_segmenter = GraphemeClusterSegmenter::new();
|
||||
return Self {
|
||||
sentence_segmenter,
|
||||
word_segmenter,
|
||||
grapheme_cluster_segmenter,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn split_sentences<'t>(&self, text: &'t str) -> Vec<&'t str> {
|
||||
return split_sentences(text, &self.sentence_segmenter);
|
||||
}
|
||||
|
||||
pub fn split_words<'t>(&self, text: &'t str) -> Vec<&'t str> {
|
||||
return split_words(text, &self.word_segmenter);
|
||||
}
|
||||
|
||||
pub fn split_grapheme_clusters<'t>(&self, text: &'t str) -> Vec<&'t str> {
|
||||
return split_grapheme_clusters(text, &self.grapheme_cluster_segmenter);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn split_sentences<'t>(text: &'t str, segmenter: &SentenceSegmenter) -> Vec<&'t str> {
|
||||
let sentences: Vec<&str> = segmenter
|
||||
.segment_str(text)
|
||||
.tuple_windows()
|
||||
.map(|(i, j)| text[i..j].trim())
|
||||
.filter(|sentence| !sentence.is_empty())
|
||||
.collect();
|
||||
return sentences;
|
||||
}
|
||||
|
||||
pub fn split_words<'t>(text: &'t str, segmenter: &WordSegmenter) -> Vec<&'t str> {
|
||||
let words: Vec<&str> = segmenter
|
||||
.segment_str(text)
|
||||
.iter_with_word_type()
|
||||
.tuple_windows()
|
||||
.filter(|(_, (_, segment_type))| segment_type.is_word_like())
|
||||
.map(|((i, _), (j, _))| &text[i..j])
|
||||
.collect();
|
||||
return words;
|
||||
}
|
||||
|
||||
pub fn split_grapheme_clusters<'t>(text: &'t str, segmenter: &GraphemeClusterSegmenter) -> Vec<&'t str> {
|
||||
let grapheme_clusters: Vec<&str> = segmenter
|
||||
.segment_str(text)
|
||||
.tuple_windows()
|
||||
.map(|(i, j)| &text[i..j])
|
||||
.collect();
|
||||
return grapheme_clusters;
|
||||
}
|
||||
509
utils/flesttools/Cargo.lock
generated
Normal file
509
utils/flesttools/Cargo.lock
generated
Normal file
@@ -0,0 +1,509 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.1.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
|
||||
dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core_maths"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3"
|
||||
dependencies = [
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
||||
|
||||
[[package]]
|
||||
name = "flest"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"fxhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flesttools"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"flest",
|
||||
"pancurses",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"textutils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_provider_macros",
|
||||
"stable_deref_trait",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider_macros"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_segmenter"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de"
|
||||
dependencies = [
|
||||
"core_maths",
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_locid",
|
||||
"icu_provider",
|
||||
"icu_segmenter_data",
|
||||
"utf8_iter",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_segmenter_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.160"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0b21006cd1874ae9e650973c565615676dc4a274c965bb0a73796dac838ce4f"
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linkify"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "ncurses"
|
||||
version = "5.101.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e2c5d34d72657dc4b638a1c25d40aae81e4f1c699062f72f467237920752032"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pancurses"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0352975c36cbacb9ee99bfb709b9db818bed43af57751797f8633649759d13db"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"ncurses",
|
||||
"pdcurses-sys",
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdcurses-sys"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "084dd22796ff60f1225d4eb6329f33afaf4c85419d51d440ab6b8c6f4529166b"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.88"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.129"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dbcf9b78a125ee667ae19388837dd12294b858d101fdd393cb9d5501ef09eb2"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textutils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"icu_segmenter",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"linkify",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "winreg"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a27a759395c1195c4cc5cda607ef6f8f6498f64e78f7900f5de0a127a424704a"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
|
||||
dependencies = [
|
||||
"zerofrom-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom-derive"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
13
utils/flesttools/Cargo.toml
Normal file
13
utils/flesttools/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "flesttools"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
flest = { path = "../../libnative/flest" }
|
||||
textutils = { path = "../../libnative/textutils" }
|
||||
pancurses = { version = "0.17.0", features = ["wide"] }
|
||||
serde = "1.0.203"
|
||||
serde_json = "1.0.120"
|
||||
148
utils/flesttools/src/main.rs
Normal file
148
utils/flesttools/src/main.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
use flest::NgramModel;
|
||||
use textutils::IcuSegmenterCache;
|
||||
use pancurses::Input;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::io::BufReader;
|
||||
|
||||
const TOKEN_SENTENCE_SEPARATOR: &str = "\\sep";
|
||||
|
||||
fn tokenize_text(text: &str) -> Vec<&str> {
|
||||
let segmenters = IcuSegmenterCache::new_auto();
|
||||
let sentences = segmenters.split_sentences(text);
|
||||
let mut tokens: Vec<&str> = Vec::new();
|
||||
|
||||
tokens.push(TOKEN_SENTENCE_SEPARATOR);
|
||||
for sentence in sentences {
|
||||
let words = segmenters.split_words(sentence);
|
||||
for word in words {
|
||||
tokens.push(word);
|
||||
}
|
||||
tokens.push(TOKEN_SENTENCE_SEPARATOR);
|
||||
}
|
||||
|
||||
//println!("Tokens: {:?}", tokens);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
fn train_model(text: &str, model: &mut NgramModel) {
|
||||
let text = textutils::preprocess_auto(text);
|
||||
let text = text.trim();
|
||||
if text.is_empty() {
|
||||
return;
|
||||
}
|
||||
let tokens = tokenize_text(&text);
|
||||
//println!("Tokens: {:?}", tokens);
|
||||
let n_values = [2, 3, 4];
|
||||
|
||||
for &n in &n_values {
|
||||
if n > tokens.len() {
|
||||
continue;
|
||||
}
|
||||
for i in 0..tokens.len() - n + 1 {
|
||||
model.train_dataset(&tokens[i..(i + n)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn train_from_plain_text(path: &str, model: &mut NgramModel) {
|
||||
let text = fs::read_to_string(path).expect("Failed to read file");
|
||||
train_model(&text, model);
|
||||
}
|
||||
|
||||
fn train_from_reddit_comments(path: &str, model: &mut NgramModel) {
|
||||
let file = fs::File::open(path).expect("Failed to open file");
|
||||
let reader = BufReader::new(file);
|
||||
let mut line_count = 0;
|
||||
for line in reader.lines() {
|
||||
if let Ok(line) = line {
|
||||
let json: serde_json::Value = serde_json::from_str(&line).expect("Failed to parse JSON");
|
||||
|
||||
if let Some(author) = json.get("author").and_then(|it| it.as_str()) {
|
||||
if author == "AutoModerator" {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(body) = json.get("body").and_then(|it| it.as_str()) {
|
||||
train_model(body, model);
|
||||
}
|
||||
}
|
||||
line_count += 1;
|
||||
if line_count > 10000 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = env::args().collect();
|
||||
if args.len() != 2 {
|
||||
eprintln!("Usage: {} <file_path>", args[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
let path = &args[1];
|
||||
let mut model = NgramModel::default();
|
||||
|
||||
if path.ends_with(".reddit.jsonl") {
|
||||
train_from_reddit_comments(path, &mut model);
|
||||
} else {
|
||||
train_from_plain_text(path, &mut model);
|
||||
}
|
||||
|
||||
let window = pancurses::initscr();
|
||||
let mut input_text = String::new();
|
||||
|
||||
pancurses::noecho();
|
||||
window.keypad(true);
|
||||
loop {
|
||||
let mut words: Vec<&str> = input_text.split_whitespace().collect();
|
||||
words.insert(0, TOKEN_SENTENCE_SEPARATOR);
|
||||
|
||||
if input_text.ends_with(' ') || words.last() == Some(&TOKEN_SENTENCE_SEPARATOR) {
|
||||
words.push("");
|
||||
}
|
||||
|
||||
let predictions = model.predict(&words);
|
||||
|
||||
window.clear();
|
||||
window.addstr("N-gram model debug frontend\n");
|
||||
window.addstr(" demo tokenizer only supports single-line sentence in input text!\n\n");
|
||||
window.addstr(format!("enter text: {}\n", input_text));
|
||||
window.addstr(format!("detected words: {:?}\n\n", words));
|
||||
window.addstr("predictions:\n");
|
||||
for (i, (word, weight)) in predictions.iter().enumerate() {
|
||||
if i == 0 && *weight > 0.9 {
|
||||
window.attron(pancurses::A_BOLD);
|
||||
}
|
||||
window.addstr(format!(" {}. {} (c={:.2})\n", i + 1, word, weight));
|
||||
if i == 0 && *weight > 0.9 {
|
||||
window.attroff(pancurses::A_BOLD);
|
||||
}
|
||||
}
|
||||
if predictions.is_empty() {
|
||||
window.addstr(" (none)\n");
|
||||
}
|
||||
window.mv(3, 12 + input_text.len() as i32);
|
||||
window.refresh();
|
||||
|
||||
match window.getch().unwrap() {
|
||||
Input::KeyF10 => {
|
||||
break
|
||||
}
|
||||
Input::KeyBackspace => {
|
||||
input_text.pop();
|
||||
}
|
||||
Input::Character('\n') => {
|
||||
train_model(&input_text, &mut model)
|
||||
}
|
||||
Input::Character(ch) => {
|
||||
input_text.push(ch)
|
||||
}
|
||||
_ => { () }
|
||||
}
|
||||
}
|
||||
|
||||
pancurses::endwin();
|
||||
}
|
||||
27
utils/setup_vscode_dev_env.sh
Executable file
27
utils/setup_vscode_dev_env.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
#
# Generates .vscode/settings.json for this workspace so that rust-analyzer
# knows about every Cargo project in the tree.
#
# Must be run from the workspace root directory.
# NOTE: an existing .vscode/settings.json is overwritten.

WORKSPACE_ROOT_DIR="$(realpath "$(dirname "$0")/..")"
VSCODE_DIR="$WORKSPACE_ROOT_DIR/.vscode"
VSCODE_SETTINGS_JSON_PATH="$VSCODE_DIR/settings.json"

if [ "$WORKSPACE_ROOT_DIR" != "$(pwd)" ]; then
    echo "Not executing this script from workspace root dir!"
    exit 1
fi

mkdir -p "$VSCODE_DIR"

# Escapes backslashes and double quotes so a path can be embedded in a JSON
# string literal.
json_escape() {
    local value="$1"
    value="${value//\\/\\\\}"
    value="${value//\"/\\\"}"
    printf '%s' "$value"
}

{
    printf '{\n'

    # <rust-analyzer>
    printf '    "rust-analyzer.linkedProjects": [\n'
    # Read find's output line by line instead of word-splitting it, so paths
    # containing spaces stay intact. The separator is emitted *before* every
    # entry but the first, which keeps the array free of a trailing comma.
    # (Paths containing newlines are still not supported.)
    separator=''
    while IFS= read -r rust_project_path; do
        printf '%s        "%s"' "$separator" "$(json_escape "$rust_project_path")"
        separator=$',\n'
    done < <(find "$WORKSPACE_ROOT_DIR" -type f -name "Cargo.toml")
    if [ -n "$separator" ]; then
        printf '\n'
    fi
    printf '    ]\n'
    # </rust-analyzer>

    printf '}\n'
} > "$VSCODE_SETTINGS_JSON_PATH"
|
||||
Reference in New Issue
Block a user