Files
qrust/src/threads/data/backfill.rs
Nikolaos Karaolidis 4b194e168f Add paper URL support
Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
2024-02-14 21:15:27 +00:00

429 lines
13 KiB
Rust

use super::ThreadType;
use crate::{
config::{Config, ALPACA_CRYPTO_DATA_API_URL, ALPACA_STOCK_DATA_API_URL},
database,
types::{
alpaca::{
api,
shared::{Sort, Source},
},
news::Prediction,
Backfill, Bar, Class, News,
},
utils::{duration_until, last_minute, FIFTEEN_MINUTES, ONE_MINUTE, ONE_SECOND},
};
use async_trait::async_trait;
use futures_util::future::join_all;
use log::{error, info, warn};
use std::{collections::HashMap, sync::Arc};
use time::OffsetDateTime;
use tokio::{
spawn,
sync::{mpsc, oneshot, Mutex},
task::{block_in_place, JoinHandle},
time::sleep,
try_join,
};
pub enum Action {
Backfill,
Purge,
}
impl From<super::Action> for Action {
fn from(action: super::Action) -> Self {
match action {
super::Action::Add => Self::Backfill,
super::Action::Remove => Self::Purge,
}
}
}
pub struct Message {
pub action: Action,
pub symbols: Vec<String>,
pub response: oneshot::Sender<()>,
}
impl Message {
pub fn new(action: Action, symbols: Vec<String>) -> (Self, oneshot::Receiver<()>) {
let (sender, receiver) = oneshot::channel::<()>();
(
Self {
action,
symbols,
response: sender,
},
receiver,
)
}
}
#[async_trait]
pub trait Handler: Send + Sync {
async fn select_latest_backfill(
&self,
symbol: String,
) -> Result<Option<Backfill>, clickhouse::error::Error>;
async fn delete_backfills(&self, symbol: &[String]) -> Result<(), clickhouse::error::Error>;
async fn delete_data(&self, symbol: &[String]) -> Result<(), clickhouse::error::Error>;
async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime);
async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime);
fn log_string(&self) -> &'static str;
}
pub async fn run(handler: Arc<Box<dyn Handler>>, mut receiver: mpsc::Receiver<Message>) {
let backfill_jobs = Arc::new(Mutex::new(HashMap::new()));
loop {
let message = receiver.recv().await.unwrap();
spawn(handle_backfill_message(
handler.clone(),
backfill_jobs.clone(),
message,
));
}
}
async fn handle_backfill_message(
handler: Arc<Box<dyn Handler>>,
backfill_jobs: Arc<Mutex<HashMap<String, JoinHandle<()>>>>,
message: Message,
) {
let mut backfill_jobs = backfill_jobs.lock().await;
match message.action {
Action::Backfill => {
let log_string = handler.log_string();
for symbol in message.symbols {
if let Some(job) = backfill_jobs.get(&symbol) {
if !job.is_finished() {
warn!(
"Backfill for {} {} is already running, skipping.",
symbol, log_string
);
continue;
}
}
let handler = handler.clone();
backfill_jobs.insert(
symbol.clone(),
spawn(async move {
let fetch_from = match handler
.select_latest_backfill(symbol.clone())
.await
.unwrap()
{
Some(latest_backfill) => latest_backfill.time + ONE_SECOND,
None => OffsetDateTime::UNIX_EPOCH,
};
let fetch_to = last_minute();
if fetch_from > fetch_to {
info!("No need to backfill {} {}.", symbol, log_string,);
return;
}
handler.queue_backfill(&symbol, fetch_to).await;
handler.backfill(symbol, fetch_from, fetch_to).await;
}),
);
}
}
Action::Purge => {
for symbol in &message.symbols {
if let Some(job) = backfill_jobs.remove(symbol) {
if !job.is_finished() {
job.abort();
}
let _ = job.await;
}
}
try_join!(
handler.delete_backfills(&message.symbols),
handler.delete_data(&message.symbols)
)
.unwrap();
}
}
message.response.send(()).unwrap();
}
struct BarHandler {
config: Arc<Config>,
data_url: &'static str,
api_query_constructor: fn(
config: &Arc<Config>,
symbol: String,
fetch_from: OffsetDateTime,
fetch_to: OffsetDateTime,
next_page_token: Option<String>,
) -> api::outgoing::bar::Bar,
}
fn us_equity_query_constructor(
config: &Arc<Config>,
symbol: String,
fetch_from: OffsetDateTime,
fetch_to: OffsetDateTime,
next_page_token: Option<String>,
) -> api::outgoing::bar::Bar {
api::outgoing::bar::Bar::UsEquity {
symbols: vec![symbol],
timeframe: ONE_MINUTE,
start: Some(fetch_from),
end: Some(fetch_to),
limit: Some(10000),
adjustment: None,
asof: None,
feed: Some(config.alpaca_source),
currency: None,
page_token: next_page_token,
sort: Some(Sort::Asc),
}
}
fn crypto_query_constructor(
_: &Arc<Config>,
symbol: String,
fetch_from: OffsetDateTime,
fetch_to: OffsetDateTime,
next_page_token: Option<String>,
) -> api::outgoing::bar::Bar {
api::outgoing::bar::Bar::Crypto {
symbols: vec![symbol],
timeframe: ONE_MINUTE,
start: Some(fetch_from),
end: Some(fetch_to),
limit: Some(10000),
page_token: next_page_token,
sort: Some(Sort::Asc),
}
}
#[async_trait]
impl Handler for BarHandler {
async fn select_latest_backfill(
&self,
symbol: String,
) -> Result<Option<Backfill>, clickhouse::error::Error> {
database::backfills_bars::select_where_symbol(&self.config.clickhouse_client, &symbol).await
}
async fn delete_backfills(&self, symbols: &[String]) -> Result<(), clickhouse::error::Error> {
database::backfills_bars::delete_where_symbols(&self.config.clickhouse_client, symbols)
.await
}
async fn delete_data(&self, symbols: &[String]) -> Result<(), clickhouse::error::Error> {
database::bars::delete_where_symbols(&self.config.clickhouse_client, symbols).await
}
async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime) {
if self.config.alpaca_source == Source::Iex {
let run_delay = duration_until(fetch_to + FIFTEEN_MINUTES + ONE_MINUTE);
info!("Queing bar backfill for {} in {:?}.", symbol, run_delay);
sleep(run_delay).await;
}
}
async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime) {
info!("Backfilling bars for {}.", symbol);
let mut bars = vec![];
let mut next_page_token = None;
loop {
let Ok(message) = api::incoming::bar::get_historical(
&self.config,
self.data_url,
&(self.api_query_constructor)(
&self.config,
symbol.clone(),
fetch_from,
fetch_to,
next_page_token.clone(),
),
None,
)
.await
else {
error!("Failed to backfill bars for {}.", symbol);
return;
};
message.bars.into_iter().for_each(|(symbol, bar_vec)| {
for bar in bar_vec {
bars.push(Bar::from((bar, symbol.clone())));
}
});
if message.next_page_token.is_none() {
break;
}
next_page_token = message.next_page_token;
}
if bars.is_empty() {
info!("No bars to backfill for {}.", symbol);
return;
}
let backfill = bars.last().unwrap().clone().into();
database::bars::upsert_batch(&self.config.clickhouse_client, &bars)
.await
.unwrap();
database::backfills_bars::upsert(&self.config.clickhouse_client, &backfill)
.await
.unwrap();
info!("Backfilled bars for {}.", symbol);
}
fn log_string(&self) -> &'static str {
"bars"
}
}
struct NewsHandler {
config: Arc<Config>,
}
#[async_trait]
impl Handler for NewsHandler {
async fn select_latest_backfill(
&self,
symbol: String,
) -> Result<Option<Backfill>, clickhouse::error::Error> {
database::backfills_news::select_where_symbol(&self.config.clickhouse_client, &symbol).await
}
async fn delete_backfills(&self, symbols: &[String]) -> Result<(), clickhouse::error::Error> {
database::backfills_news::delete_where_symbols(&self.config.clickhouse_client, symbols)
.await
}
async fn delete_data(&self, symbols: &[String]) -> Result<(), clickhouse::error::Error> {
database::news::delete_where_symbols(&self.config.clickhouse_client, symbols).await
}
async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime) {
let run_delay = duration_until(fetch_to + FIFTEEN_MINUTES + ONE_MINUTE);
info!("Queing news backfill for {} in {:?}.", symbol, run_delay);
sleep(run_delay).await;
}
async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime) {
info!("Backfilling news for {}.", symbol);
let mut news = vec![];
let mut next_page_token = None;
loop {
let Ok(message) = api::incoming::news::get_historical(
&self.config,
&api::outgoing::news::News {
symbols: vec![symbol.clone()],
start: Some(fetch_from),
end: Some(fetch_to),
limit: Some(50),
include_content: Some(true),
exclude_contentless: Some(false),
page_token: next_page_token.clone(),
sort: Some(Sort::Asc),
},
None,
)
.await
else {
error!("Failed to backfill news for {}.", symbol);
return;
};
message.news.into_iter().for_each(|news_item| {
news.push(News::from(news_item));
});
if message.next_page_token.is_none() {
break;
}
next_page_token = message.next_page_token;
}
if news.is_empty() {
info!("No news to backfill for {}.", symbol);
return;
}
let inputs = news
.iter()
.map(|news| format!("{}\n\n{}", news.headline, news.content))
.collect::<Vec<_>>();
let predictions = join_all(inputs.chunks(self.config.max_bert_inputs).map(|inputs| {
let sequence_classifier = self.config.sequence_classifier.clone();
async move {
let sequence_classifier = sequence_classifier.lock().await;
block_in_place(|| {
sequence_classifier
.predict(inputs.iter().map(String::as_str).collect::<Vec<_>>())
.into_iter()
.map(|label| Prediction::try_from(label).unwrap())
.collect::<Vec<_>>()
})
}
}))
.await
.into_iter()
.flatten();
let news = news
.into_iter()
.zip(predictions)
.map(|(news, prediction)| News {
sentiment: prediction.sentiment,
confidence: prediction.confidence,
..news
})
.collect::<Vec<_>>();
let backfill = (news.last().unwrap().clone(), symbol.clone()).into();
database::news::upsert_batch(&self.config.clickhouse_client, &news)
.await
.unwrap();
database::backfills_news::upsert(&self.config.clickhouse_client, &backfill)
.await
.unwrap();
info!("Backfilled news for {}.", symbol);
}
fn log_string(&self) -> &'static str {
"news"
}
}
pub fn create_handler(thread_type: ThreadType, config: Arc<Config>) -> Box<dyn Handler> {
match thread_type {
ThreadType::Bars(Class::UsEquity) => Box::new(BarHandler {
config,
data_url: ALPACA_STOCK_DATA_API_URL,
api_query_constructor: us_equity_query_constructor,
}),
ThreadType::Bars(Class::Crypto) => Box::new(BarHandler {
config,
data_url: ALPACA_CRYPTO_DATA_API_URL,
api_query_constructor: crypto_query_constructor,
}),
ThreadType::News => Box::new(NewsHandler { config }),
}
}