Add news content normalization and storing
Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
55
Cargo.lock
generated
55
Cargo.lock
generated
@@ -17,6 +17,15 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "android-tzdata"
|
name = "android-tzdata"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@@ -686,6 +695,15 @@ version = "0.4.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html-escape"
|
||||||
|
version = "0.2.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||||
|
dependencies = [
|
||||||
|
"utf8-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
@@ -1282,9 +1300,11 @@ dependencies = [
|
|||||||
"dotenv",
|
"dotenv",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"governor",
|
"governor",
|
||||||
|
"html-escape",
|
||||||
"http 1.0.0",
|
"http 1.0.0",
|
||||||
"log",
|
"log",
|
||||||
"log4rs",
|
"log4rs",
|
||||||
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@@ -1369,6 +1389,35 @@ dependencies = [
|
|||||||
"bitflags 1.3.2",
|
"bitflags 1.3.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.10.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest"
|
name = "reqwest"
|
||||||
version = "0.11.23"
|
version = "0.11.23"
|
||||||
@@ -2017,6 +2066,12 @@ version = "0.7.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-width"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "uuid"
|
name = "uuid"
|
||||||
version = "1.7.0"
|
version = "1.7.0"
|
||||||
|
@@ -50,3 +50,5 @@ time = { version = "0.3.31", features = [
|
|||||||
backoff = { version = "0.4.0", features = [
|
backoff = { version = "0.4.0", features = [
|
||||||
"tokio",
|
"tokio",
|
||||||
] }
|
] }
|
||||||
|
regex = "1.10.3"
|
||||||
|
html-escape = "0.2.13"
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
use crate::types;
|
use crate::{types, utils::normalize_news_content};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_with::serde_as;
|
use serde_with::serde_as;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
@@ -28,8 +28,10 @@ pub struct News {
|
|||||||
#[serde(rename = "updated_at")]
|
#[serde(rename = "updated_at")]
|
||||||
pub time_updated: OffsetDateTime,
|
pub time_updated: OffsetDateTime,
|
||||||
pub symbols: Vec<String>,
|
pub symbols: Vec<String>,
|
||||||
pub headline: String,
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub author: String,
|
pub headline: Option<String>,
|
||||||
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
|
pub author: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub source: Option<String>,
|
pub source: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
@@ -48,10 +50,11 @@ impl From<News> for types::News {
|
|||||||
time_created: news.time_created,
|
time_created: news.time_created,
|
||||||
time_updated: news.time_updated,
|
time_updated: news.time_updated,
|
||||||
symbols: news.symbols,
|
symbols: news.symbols,
|
||||||
headline: news.headline,
|
headline: normalize_news_content(news.headline),
|
||||||
author: news.author,
|
author: normalize_news_content(news.author),
|
||||||
source: news.source,
|
source: normalize_news_content(news.source),
|
||||||
summary: news.summary,
|
summary: normalize_news_content(news.summary),
|
||||||
|
content: normalize_news_content(news.content),
|
||||||
url: news.url,
|
url: news.url,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
use crate::types;
|
use crate::{types, utils::normalize_news_content};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_with::serde_as;
|
use serde_with::serde_as;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
@@ -14,8 +14,10 @@ pub struct Message {
|
|||||||
#[serde(rename = "updated_at")]
|
#[serde(rename = "updated_at")]
|
||||||
pub time_updated: OffsetDateTime,
|
pub time_updated: OffsetDateTime,
|
||||||
pub symbols: Vec<String>,
|
pub symbols: Vec<String>,
|
||||||
pub headline: String,
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub author: String,
|
pub headline: Option<String>,
|
||||||
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
|
pub author: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub source: Option<String>,
|
pub source: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
@@ -33,10 +35,11 @@ impl From<Message> for types::News {
|
|||||||
time_created: news.time_created,
|
time_created: news.time_created,
|
||||||
time_updated: news.time_updated,
|
time_updated: news.time_updated,
|
||||||
symbols: news.symbols,
|
symbols: news.symbols,
|
||||||
headline: news.headline,
|
headline: normalize_news_content(news.headline),
|
||||||
author: news.author,
|
author: normalize_news_content(news.author),
|
||||||
source: news.source,
|
source: normalize_news_content(news.source),
|
||||||
summary: news.summary,
|
summary: normalize_news_content(news.summary),
|
||||||
|
content: normalize_news_content(news.content),
|
||||||
url: news.url,
|
url: news.url,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -12,12 +12,16 @@ pub struct News {
|
|||||||
#[serde(with = "clickhouse::serde::time::datetime")]
|
#[serde(with = "clickhouse::serde::time::datetime")]
|
||||||
pub time_updated: OffsetDateTime,
|
pub time_updated: OffsetDateTime,
|
||||||
pub symbols: Vec<String>,
|
pub symbols: Vec<String>,
|
||||||
pub headline: String,
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub author: String,
|
pub headline: Option<String>,
|
||||||
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
|
pub author: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub source: Option<String>,
|
pub source: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub summary: Option<String>,
|
pub summary: Option<String>,
|
||||||
#[serde_as(as = "NoneAsEmptyString")]
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
|
pub content: Option<String>,
|
||||||
|
#[serde_as(as = "NoneAsEmptyString")]
|
||||||
pub url: Option<String>,
|
pub url: Option<String>,
|
||||||
}
|
}
|
||||||
|
@@ -1,7 +1,9 @@
|
|||||||
pub mod cleanup;
|
pub mod cleanup;
|
||||||
|
pub mod news;
|
||||||
pub mod time;
|
pub mod time;
|
||||||
pub mod websocket;
|
pub mod websocket;
|
||||||
|
|
||||||
pub use cleanup::cleanup;
|
pub use cleanup::cleanup;
|
||||||
|
pub use news::normalize_news_content;
|
||||||
pub use time::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE};
|
pub use time::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE};
|
||||||
pub use websocket::authenticate;
|
pub use websocket::authenticate;
|
||||||
|
22
src/utils/news.rs
Normal file
22
src/utils/news.rs
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
use html_escape::decode_html_entities;
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
pub fn normalize_news_content(content: Option<String>) -> Option<String> {
|
||||||
|
content.as_ref()?;
|
||||||
|
let content = content.unwrap();
|
||||||
|
|
||||||
|
let re_tags = Regex::new("<[^>]+>").unwrap();
|
||||||
|
let re_spaces = Regex::new("[\\u00A0\\s]+").unwrap();
|
||||||
|
|
||||||
|
let content = content.replace('\n', " ");
|
||||||
|
let content = re_tags.replace_all(&content, "");
|
||||||
|
let content = re_spaces.replace_all(&content, " ");
|
||||||
|
let content = decode_html_entities(&content);
|
||||||
|
let content = content.trim();
|
||||||
|
|
||||||
|
if content.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(content.to_string())
|
||||||
|
}
|
||||||
|
}
|
@@ -47,9 +47,10 @@ CREATE TABLE IF NOT EXISTS qrust.news (
|
|||||||
symbols Array(LowCardinality(String)),
|
symbols Array(LowCardinality(String)),
|
||||||
headline String,
|
headline String,
|
||||||
author String,
|
author String,
|
||||||
source Nullable(String),
|
source String,
|
||||||
summary Nullable(String),
|
summary String,
|
||||||
url Nullable(String),
|
content String,
|
||||||
|
url String,
|
||||||
INDEX index_symbols symbols TYPE bloom_filter()
|
INDEX index_symbols symbols TYPE bloom_filter()
|
||||||
)
|
)
|
||||||
ENGINE = ReplacingMergeTree()
|
ENGINE = ReplacingMergeTree()
|
||||||
|
Reference in New Issue
Block a user