Add news content normalization and storing

Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
2024-01-26 13:26:52 +00:00
parent 5f458d9ea9
commit 2829bb2970
8 changed files with 111 additions and 19 deletions

View File

@@ -1,4 +1,4 @@
use crate::types;
use crate::{types, utils::normalize_news_content};
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use time::OffsetDateTime;
@@ -28,8 +28,10 @@ pub struct News {
#[serde(rename = "updated_at")]
pub time_updated: OffsetDateTime,
pub symbols: Vec<String>,
pub headline: String,
pub author: String,
#[serde_as(as = "NoneAsEmptyString")]
pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
@@ -48,10 +50,11 @@ impl From<News> for types::News {
time_created: news.time_created,
time_updated: news.time_updated,
symbols: news.symbols,
headline: news.headline,
author: news.author,
source: news.source,
summary: news.summary,
headline: normalize_news_content(news.headline),
author: normalize_news_content(news.author),
source: normalize_news_content(news.source),
summary: normalize_news_content(news.summary),
content: normalize_news_content(news.content),
url: news.url,
}
}

View File

@@ -1,4 +1,4 @@
use crate::types;
use crate::{types, utils::normalize_news_content};
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use time::OffsetDateTime;
@@ -14,8 +14,10 @@ pub struct Message {
#[serde(rename = "updated_at")]
pub time_updated: OffsetDateTime,
pub symbols: Vec<String>,
pub headline: String,
pub author: String,
#[serde_as(as = "NoneAsEmptyString")]
pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
@@ -33,10 +35,11 @@ impl From<Message> for types::News {
time_created: news.time_created,
time_updated: news.time_updated,
symbols: news.symbols,
headline: news.headline,
author: news.author,
source: news.source,
summary: news.summary,
headline: normalize_news_content(news.headline),
author: normalize_news_content(news.author),
source: normalize_news_content(news.source),
summary: normalize_news_content(news.summary),
content: normalize_news_content(news.content),
url: news.url,
}
}

View File

@@ -12,12 +12,16 @@ pub struct News {
#[serde(with = "clickhouse::serde::time::datetime")]
pub time_updated: OffsetDateTime,
pub symbols: Vec<String>,
pub headline: String,
pub author: String,
#[serde_as(as = "NoneAsEmptyString")]
pub headline: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub author: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub source: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub summary: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub content: Option<String>,
#[serde_as(as = "NoneAsEmptyString")]
pub url: Option<String>,
}

View File

@@ -1,7 +1,9 @@
pub mod cleanup;
pub mod news;
pub mod time;
pub mod websocket;
pub use cleanup::cleanup;
pub use news::normalize_news_content;
pub use time::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE};
pub use websocket::authenticate;

22
src/utils/news.rs Normal file
View File

@@ -0,0 +1,22 @@
use html_escape::decode_html_entities;
use regex::Regex;
pub fn normalize_news_content(content: Option<String>) -> Option<String> {
content.as_ref()?;
let content = content.unwrap();
let re_tags = Regex::new("<[^>]+>").unwrap();
let re_spaces = Regex::new("[\\u00A0\\s]+").unwrap();
let content = content.replace('\n', " ");
let content = re_tags.replace_all(&content, "");
let content = re_spaces.replace_all(&content, " ");
let content = decode_html_entities(&content);
let content = content.trim();
if content.is_empty() {
None
} else {
Some(content.to_string())
}
}