Add news content normalization and storing

Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
2024-01-26 13:26:52 +00:00
parent 5f458d9ea9
commit 2829bb2970
8 changed files with 111 additions and 19 deletions

55
Cargo.lock generated
View File

@@ -17,6 +17,15 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "android-tzdata" name = "android-tzdata"
version = "0.1.1" version = "0.1.1"
@@ -686,6 +695,15 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]] [[package]]
name = "http" name = "http"
version = "0.2.11" version = "0.2.11"
@@ -1282,9 +1300,11 @@ dependencies = [
"dotenv", "dotenv",
"futures-util", "futures-util",
"governor", "governor",
"html-escape",
"http 1.0.0", "http 1.0.0",
"log", "log",
"log4rs", "log4rs",
"regex",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
@@ -1369,6 +1389,35 @@ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
] ]
[[package]]
name = "regex"
version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]] [[package]]
name = "reqwest" name = "reqwest"
version = "0.11.23" version = "0.11.23"
@@ -2017,6 +2066,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]] [[package]]
name = "uuid" name = "uuid"
version = "1.7.0" version = "1.7.0"

View File

@@ -50,3 +50,5 @@ time = { version = "0.3.31", features = [
backoff = { version = "0.4.0", features = [ backoff = { version = "0.4.0", features = [
"tokio", "tokio",
] } ] }
regex = "1.10.3"
html-escape = "0.2.13"

View File

@@ -1,4 +1,4 @@
use crate::types; use crate::{types, utils::normalize_news_content};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::serde_as; use serde_with::serde_as;
use time::OffsetDateTime; use time::OffsetDateTime;
@@ -28,8 +28,10 @@ pub struct News {
#[serde(rename = "updated_at")] #[serde(rename = "updated_at")]
pub time_updated: OffsetDateTime, pub time_updated: OffsetDateTime,
pub symbols: Vec<String>, pub symbols: Vec<String>,
pub headline: String, #[serde_as(as = "NoneAsEmptyString")]
pub author: String, pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>, pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
@@ -48,10 +50,11 @@ impl From<News> for types::News {
time_created: news.time_created, time_created: news.time_created,
time_updated: news.time_updated, time_updated: news.time_updated,
symbols: news.symbols, symbols: news.symbols,
headline: news.headline, headline: normalize_news_content(news.headline),
author: news.author, author: normalize_news_content(news.author),
source: news.source, source: normalize_news_content(news.source),
summary: news.summary, summary: normalize_news_content(news.summary),
content: normalize_news_content(news.content),
url: news.url, url: news.url,
} }
} }

View File

@@ -1,4 +1,4 @@
use crate::types; use crate::{types, utils::normalize_news_content};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::serde_as; use serde_with::serde_as;
use time::OffsetDateTime; use time::OffsetDateTime;
@@ -14,8 +14,10 @@ pub struct Message {
#[serde(rename = "updated_at")] #[serde(rename = "updated_at")]
pub time_updated: OffsetDateTime, pub time_updated: OffsetDateTime,
pub symbols: Vec<String>, pub symbols: Vec<String>,
pub headline: String, #[serde_as(as = "NoneAsEmptyString")]
pub author: String, pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>, pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
@@ -33,10 +35,11 @@ impl From<Message> for types::News {
time_created: news.time_created, time_created: news.time_created,
time_updated: news.time_updated, time_updated: news.time_updated,
symbols: news.symbols, symbols: news.symbols,
headline: news.headline, headline: normalize_news_content(news.headline),
author: news.author, author: normalize_news_content(news.author),
source: news.source, source: normalize_news_content(news.source),
summary: news.summary, summary: normalize_news_content(news.summary),
content: normalize_news_content(news.content),
url: news.url, url: news.url,
} }
} }

View File

@@ -12,12 +12,16 @@ pub struct News {
#[serde(with = "clickhouse::serde::time::datetime")] #[serde(with = "clickhouse::serde::time::datetime")]
pub time_updated: OffsetDateTime, pub time_updated: OffsetDateTime,
pub symbols: Vec<String>, pub symbols: Vec<String>,
pub headline: String, #[serde_as(as = "NoneAsEmptyString")]
pub author: String, pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>, pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
pub summary: Option<String>, pub summary: Option<String>,
#[serde_as(as = "NoneAsEmptyString")] #[serde_as(as = "NoneAsEmptyString")]
pub content: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub url: Option<String>, pub url: Option<String>,
} }

View File

@@ -1,7 +1,9 @@
pub mod cleanup; pub mod cleanup;
pub mod news;
pub mod time; pub mod time;
pub mod websocket; pub mod websocket;
pub use cleanup::cleanup; pub use cleanup::cleanup;
pub use news::normalize_news_content;
pub use time::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE}; pub use time::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE};
pub use websocket::authenticate; pub use websocket::authenticate;

22
src/utils/news.rs Normal file
View File

@@ -0,0 +1,22 @@
use html_escape::decode_html_entities;
use regex::Regex;
pub fn normalize_news_content(content: Option<String>) -> Option<String> {
content.as_ref()?;
let content = content.unwrap();
let re_tags = Regex::new("<[^>]+>").unwrap();
let re_spaces = Regex::new("[\\u00A0\\s]+").unwrap();
let content = content.replace('\n', " ");
let content = re_tags.replace_all(&content, "");
let content = re_spaces.replace_all(&content, " ");
let content = decode_html_entities(&content);
let content = content.trim();
if content.is_empty() {
None
} else {
Some(content.to_string())
}
}

View File

@@ -47,9 +47,10 @@ CREATE TABLE IF NOT EXISTS qrust.news (
symbols Array(LowCardinality(String)), symbols Array(LowCardinality(String)),
headline String, headline String,
author String, author String,
source Nullable(String), source String,
summary Nullable(String), summary String,
url Nullable(String), content String,
url String,
INDEX index_symbols symbols TYPE bloom_filter() INDEX index_symbols symbols TYPE bloom_filter()
) )
ENGINE = ReplacingMergeTree() ENGINE = ReplacingMergeTree()