Add pipelined backfilling

Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
2024-03-10 11:22:24 +00:00
parent 681d7393d7
commit acfc0ca4c9
7 changed files with 258 additions and 105 deletions
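The core of the change is in the backfill worker below: instead of spawning one task per symbol, pending backfill ranges are packed into groups sized against the API's row limit, and each group is queued, fetched, flushed, and marked in a single pipelined task. A minimal, self-contained sketch of that grouping step (hypothetical stand-in types; the real code packs Job { fetch_from, fetch_to } values keyed by symbol and works on OffsetDateTime):

// Hypothetical illustration of the grouping step introduced below; `Span` stands in
// for the crate's Job { fetch_from, fetch_to } and minutes stand in for OffsetDateTime.
use std::collections::HashMap;

#[derive(Clone, Copy)]
struct Span {
    from_minute: i64,
    to_minute: i64,
}

// Pack symbol spans, sorted by start time, into groups whose combined length stays
// within one request's row limit (max_limit rows, i.e. minutes for one-minute bars).
fn group_jobs(mut jobs: Vec<(String, Span)>, max_limit: i64) -> Vec<HashMap<String, Span>> {
    jobs.sort_by_key(|(_, span)| span.from_minute);

    let mut groups: Vec<HashMap<String, Span>> = vec![HashMap::new()];
    let mut current_minutes = 0;

    for (symbol, span) in jobs {
        let minutes = span.to_minute - span.from_minute;

        if groups.last().unwrap().is_empty() || current_minutes + minutes <= max_limit {
            // Current group still has room (or is empty): add this symbol to it.
            groups.last_mut().unwrap().insert(symbol, span);
            current_minutes += minutes;
        } else {
            // Current group is full: start a new group with this symbol.
            let mut group = HashMap::new();
            group.insert(symbol, span);
            groups.push(group);
            current_minutes = minutes;
        }
    }

    groups
}

For instance, ten symbols each missing about a day (1440 minutes) of one-minute bars fit into two groups under a 10_000-row limit, rather than ten separate request pipelines.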

View File

@@ -1,13 +1,13 @@
 use std::sync::Arc;
 
 use crate::{
-    cleanup, delete_where_symbols, optimize, select_where_symbols, types::Backfill, upsert,
+    cleanup, delete_where_symbols, optimize, select_where_symbols, types::Backfill, upsert_batch,
 };
 use clickhouse::{error::Error, Client};
 use tokio::sync::Semaphore;
 
 select_where_symbols!(Backfill, "backfills_bars");
-upsert!(Backfill, "backfills_bars");
+upsert_batch!(Backfill, "backfills_bars");
 delete_where_symbols!("backfills_bars");
 cleanup!("backfills_bars");
 optimize!("backfills_bars");

View File

@@ -1,13 +1,13 @@
 use std::sync::Arc;
 
 use crate::{
-    cleanup, delete_where_symbols, optimize, select_where_symbols, types::Backfill, upsert,
+    cleanup, delete_where_symbols, optimize, select_where_symbols, types::Backfill, upsert_batch,
 };
 use clickhouse::{error::Error, Client};
 use tokio::sync::Semaphore;
 
 select_where_symbols!(Backfill, "backfills_news");
-upsert!(Backfill, "backfills_news");
+upsert_batch!(Backfill, "backfills_news");
 delete_where_symbols!("backfills_news");
 cleanup!("backfills_news");
 optimize!("backfills_news");

View File

@@ -4,6 +4,8 @@ use crate::{delete_where_symbols, optimize, types::Bar, upsert, upsert_batch};
 use clickhouse::Client;
 use tokio::sync::Semaphore;
 
+pub const BATCH_FLUSH_SIZE: usize = 100_000;
+
 upsert!(Bar, "bars");
 upsert_batch!(Bar, "bars");
 delete_where_symbols!("bars");

View File

@@ -5,6 +5,8 @@ use clickhouse::{error::Error, Client};
 use serde::Serialize;
 use tokio::sync::Semaphore;
 
+pub const BATCH_FLUSH_SIZE: usize = 500;
+
 upsert!(News, "news");
 upsert_batch!(News, "news");
 optimize!("news");

View File

@@ -6,7 +6,10 @@ use crate::{
     },
     database,
     types::{
-        alpaca::{self, shared::Source},
+        alpaca::{
+            self,
+            shared::{Sort, Source},
+        },
         news::Prediction,
         Backfill, Bar, Class, News,
     },
@@ -14,6 +17,7 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures_util::future::join_all;
+use itertools::{Either, Itertools};
 use log::{error, info, warn};
 use std::{collections::HashMap, sync::Arc};
 use time::OffsetDateTime;
@@ -24,6 +28,7 @@ use tokio::{
     time::sleep,
     try_join,
 };
+use uuid::Uuid;
 
 pub enum Action {
     Backfill,
@@ -50,6 +55,12 @@ impl Message {
     }
 }
 
+#[derive(Clone)]
+pub struct Job {
+    pub fetch_from: OffsetDateTime,
+    pub fetch_to: OffsetDateTime,
+}
+
 #[async_trait]
 pub trait Handler: Send + Sync {
     async fn select_latest_backfills(
@@ -58,13 +69,44 @@ pub trait Handler: Send + Sync {
     ) -> Result<Vec<Backfill>, clickhouse::error::Error>;
     async fn delete_backfills(&self, symbol: &[String]) -> Result<(), clickhouse::error::Error>;
     async fn delete_data(&self, symbol: &[String]) -> Result<(), clickhouse::error::Error>;
-    async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime);
-    async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime);
+    async fn queue_backfill(&self, jobs: &HashMap<String, Job>);
+    async fn backfill(&self, jobs: HashMap<String, Job>);
+    fn max_limit(&self) -> i64;
     fn log_string(&self) -> &'static str;
 }
 
+pub struct Jobs {
+    pub symbol_to_uuid: HashMap<String, Uuid>,
+    pub uuid_to_job: HashMap<Uuid, JoinHandle<()>>,
+}
+
+impl Jobs {
+    pub fn insert(&mut self, jobs: Vec<String>, fut: JoinHandle<()>) {
+        let uuid = Uuid::new_v4();
+        for symbol in jobs {
+            self.symbol_to_uuid.insert(symbol.clone(), uuid);
+        }
+        self.uuid_to_job.insert(uuid, fut);
+    }
+
+    pub fn get(&self, symbol: &str) -> Option<&JoinHandle<()>> {
+        self.symbol_to_uuid
+            .get(symbol)
+            .and_then(|uuid| self.uuid_to_job.get(uuid))
+    }
+
+    pub fn remove(&mut self, symbol: &str) -> Option<JoinHandle<()>> {
+        self.symbol_to_uuid
+            .remove(symbol)
+            .and_then(|uuid| self.uuid_to_job.remove(&uuid))
+    }
+}
+
 pub async fn run(handler: Arc<Box<dyn Handler>>, mut receiver: mpsc::Receiver<Message>) {
-    let backfill_jobs = Arc::new(Mutex::new(HashMap::new()));
+    let backfill_jobs = Arc::new(Mutex::new(Jobs {
+        symbol_to_uuid: HashMap::new(),
+        uuid_to_job: HashMap::new(),
+    }));
 
     loop {
         let message = receiver.recv().await.unwrap();
@@ -78,7 +120,7 @@ pub async fn run(handler: Arc<Box<dyn Handler>>, mut receiver: mpsc::Receiver<Me
 async fn handle_backfill_message(
     handler: Arc<Box<dyn Handler>>,
-    backfill_jobs: Arc<Mutex<HashMap<String, JoinHandle<()>>>>,
+    backfill_jobs: Arc<Mutex<Jobs>>,
     message: Message,
 ) {
     let mut backfill_jobs = backfill_jobs.lock().await;
@@ -86,6 +128,7 @@ async fn handle_backfill_message(
     match message.action {
         Action::Backfill => {
             let log_string = handler.log_string();
+            let max_limit = handler.max_limit();
 
             let backfills = handler
                 .select_latest_backfills(&message.symbols)
@@ -95,6 +138,8 @@ async fn handle_backfill_message(
                 .map(|backfill| (backfill.symbol.clone(), backfill))
                 .collect::<HashMap<_, _>>();
 
+            let mut jobs = vec![];
+
             for symbol in message.symbols {
                 if let Some(job) = backfill_jobs.get(&symbol) {
                     if !job.is_finished() {
@@ -119,14 +164,49 @@ async fn handle_backfill_message(
                     return;
                 }
 
+                jobs.push((
+                    symbol,
+                    Job {
+                        fetch_from,
+                        fetch_to,
+                    },
+                ));
+            }
+
+            let jobs = jobs
+                .into_iter()
+                .sorted_by_key(|job| job.1.fetch_from)
+                .collect::<Vec<_>>();
+
+            let mut job_groups = vec![HashMap::new()];
+            let mut current_minutes = 0;
+
+            for job in jobs {
+                let minutes = (job.1.fetch_to - job.1.fetch_from).whole_minutes();
+
+                if job_groups.last().unwrap().is_empty() || (current_minutes + minutes) <= max_limit
+                {
+                    let job_group = job_groups.last_mut().unwrap();
+                    job_group.insert(job.0, job.1);
+                    current_minutes += minutes;
+                } else {
+                    let mut job_group = HashMap::new();
+                    job_group.insert(job.0, job.1);
+                    job_groups.push(job_group);
+                    current_minutes = minutes;
+                }
+            }
+
+            for job_group in job_groups {
+                let symbols = job_group.keys().cloned().collect::<Vec<_>>();
+
                 let handler = handler.clone();
-                backfill_jobs.insert(
-                    symbol.clone(),
-                    spawn(async move {
-                        handler.queue_backfill(&symbol, fetch_to).await;
-                        handler.backfill(symbol, fetch_from, fetch_to).await;
-                    }),
-                );
+                let fut = spawn(async move {
+                    handler.queue_backfill(&job_group).await;
+                    handler.backfill(job_group).await;
+                });
+
+                backfill_jobs.insert(symbols, fut);
             }
         }
         Action::Purge => {
@@ -154,7 +234,7 @@ struct BarHandler {
     config: Arc<Config>,
     data_url: &'static str,
     api_query_constructor: fn(
-        symbol: String,
+        symbols: Vec<String>,
         fetch_from: OffsetDateTime,
         fetch_to: OffsetDateTime,
         next_page_token: Option<String>,
@@ -162,31 +242,33 @@ struct BarHandler {
 }
 
 fn us_equity_query_constructor(
-    symbol: String,
+    symbols: Vec<String>,
     fetch_from: OffsetDateTime,
     fetch_to: OffsetDateTime,
     next_page_token: Option<String>,
 ) -> alpaca::api::outgoing::bar::Bar {
     alpaca::api::outgoing::bar::Bar::UsEquity(alpaca::api::outgoing::bar::UsEquity {
-        symbols: vec![symbol],
+        symbols,
         start: Some(fetch_from),
         end: Some(fetch_to),
         page_token: next_page_token,
+        sort: Some(Sort::Asc),
         ..Default::default()
     })
 }
 
 fn crypto_query_constructor(
-    symbol: String,
+    symbols: Vec<String>,
     fetch_from: OffsetDateTime,
     fetch_to: OffsetDateTime,
     next_page_token: Option<String>,
 ) -> alpaca::api::outgoing::bar::Bar {
     alpaca::api::outgoing::bar::Bar::Crypto(alpaca::api::outgoing::bar::Crypto {
-        symbols: vec![symbol],
+        symbols,
         start: Some(fetch_from),
         end: Some(fetch_to),
         page_token: next_page_token,
+        sort: Some(Sort::Asc),
         ..Default::default()
     })
 }
@@ -223,18 +305,31 @@ impl Handler for BarHandler {
         .await
     }
 
-    async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime) {
-        if *ALPACA_SOURCE == Source::Iex {
-            let run_delay = duration_until(fetch_to + FIFTEEN_MINUTES + ONE_MINUTE);
-            info!("Queing bar backfill for {} in {:?}.", symbol, run_delay);
-            sleep(run_delay).await;
-        }
-    }
-
-    async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime) {
-        info!("Backfilling bars for {}.", symbol);
+    async fn queue_backfill(&self, jobs: &HashMap<String, Job>) {
+        if *ALPACA_SOURCE == Source::Sip {
+            return;
+        }
+
+        let fetch_to = jobs.values().map(|job| job.fetch_to).max().unwrap();
+
+        let run_delay = duration_until(fetch_to + FIFTEEN_MINUTES + ONE_MINUTE);
+        let symbols = jobs.keys().collect::<Vec<_>>();
+        info!("Queing bar backfill for {:?} in {:?}.", symbols, run_delay);
+        sleep(run_delay).await;
+    }
+
+    async fn backfill(&self, jobs: HashMap<String, Job>) {
+        let symbols = jobs.keys().cloned().collect::<Vec<_>>();
+        let fetch_from = jobs.values().map(|job| job.fetch_from).min().unwrap();
+        let fetch_to = jobs.values().map(|job| job.fetch_to).max().unwrap();
+
+        info!("Backfilling bars for {:?}.", symbols);
 
         let mut bars = vec![];
+        let mut last_time = symbols
+            .iter()
+            .map(|symbol| (symbol.clone(), None))
+            .collect::<HashMap<_, _>>();
         let mut next_page_token = None;
 
         loop {
@@ -243,7 +338,7 @@ impl Handler for BarHandler {
                 &self.config.alpaca_rate_limiter,
                 self.data_url,
                 &(self.api_query_constructor)(
-                    symbol.clone(),
+                    symbols.clone(),
                     fetch_from,
                     fetch_to,
                     next_page_token.clone(),
@@ -252,29 +347,21 @@ impl Handler for BarHandler {
             )
             .await
             else {
-                error!("Failed to backfill bars for {}.", symbol);
+                error!("Failed to backfill bars for {:?}.", symbols);
                 return;
             };
 
-            message.bars.into_iter().for_each(|(symbol, bar_vec)| {
+            for (symbol, bar_vec) in message.bars {
+                if let Some(last) = bar_vec.last() {
+                    last_time.insert(symbol.clone(), Some(last.time));
+                }
+
                 for bar in bar_vec {
                     bars.push(Bar::from((bar, symbol.clone())));
                 }
-            });
-
-            if message.next_page_token.is_none() {
-                break;
-            }
-
-            next_page_token = message.next_page_token;
-        }
-
-        if bars.is_empty() {
-            info!("No bars to backfill for {}.", symbol);
-            return;
-        }
-
-        let backfill = bars.last().unwrap().clone().into();
+            }
 
-        database::bars::upsert_batch(
-            &self.config.clickhouse_client,
-            &self.config.clickhouse_concurrency_limiter,
+            if bars.len() >= database::bars::BATCH_FLUSH_SIZE || message.next_page_token.is_none() {
+                database::bars::upsert_batch(
+                    &self.config.clickhouse_client,
+                    &self.config.clickhouse_concurrency_limiter,
@@ -282,15 +369,38 @@ impl Handler for BarHandler {
                 )
                 .await
                 .unwrap();
 
-        database::backfills_bars::upsert(
+                bars = vec![];
+            }
+
+            if message.next_page_token.is_none() {
+                break;
+            }
+
+            next_page_token = message.next_page_token;
+        }
+
+        let (backfilled, skipped): (Vec<_>, Vec<_>) =
+            last_time.into_iter().partition_map(|(symbol, time)| {
+                if let Some(time) = time {
+                    Either::Left(Backfill { symbol, time })
+                } else {
+                    Either::Right(symbol)
+                }
+            });
+
+        database::backfills_bars::upsert_batch(
             &self.config.clickhouse_client,
             &self.config.clickhouse_concurrency_limiter,
-            &backfill,
+            &backfilled,
         )
         .await
         .unwrap();
 
-        info!("Backfilled bars for {}.", symbol);
+        info!("No bars to backfill for {:?}.", skipped);
+        info!("Backfilled bars for {:?}.", backfilled);
+    }
+
+    fn max_limit(&self) -> i64 {
+        alpaca::api::outgoing::bar::MAX_LIMIT
     }
 
     fn log_string(&self) -> &'static str {
@@ -334,16 +444,31 @@ impl Handler for NewsHandler {
         .await
     }
 
-    async fn queue_backfill(&self, symbol: &str, fetch_to: OffsetDateTime) {
+    async fn queue_backfill(&self, jobs: &HashMap<String, Job>) {
+        if *ALPACA_SOURCE == Source::Sip {
+            return;
+        }
+
+        let fetch_to = jobs.values().map(|job| job.fetch_to).max().unwrap();
+
         let run_delay = duration_until(fetch_to + FIFTEEN_MINUTES + ONE_MINUTE);
-        info!("Queing news backfill for {} in {:?}.", symbol, run_delay);
+        let symbols = jobs.keys().cloned().collect::<Vec<_>>();
+        info!("Queing news backfill for {:?} in {:?}.", symbols, run_delay);
         sleep(run_delay).await;
     }
 
-    async fn backfill(&self, symbol: String, fetch_from: OffsetDateTime, fetch_to: OffsetDateTime) {
-        info!("Backfilling news for {}.", symbol);
+    async fn backfill(&self, jobs: HashMap<String, Job>) {
+        let symbols = jobs.keys().cloned().collect::<Vec<_>>();
+        let fetch_from = jobs.values().map(|job| job.fetch_from).min().unwrap();
+        let fetch_to = jobs.values().map(|job| job.fetch_to).max().unwrap();
+
+        info!("Backfilling news for {:?}.", symbols);
 
         let mut news = vec![];
+        let mut last_time = symbols
+            .iter()
+            .map(|symbol| (symbol.clone(), None))
+            .collect::<HashMap<_, _>>();
         let mut next_page_token = None;
 
         loop {
@@ -351,7 +476,7 @@ impl Handler for NewsHandler {
                 &self.config.alpaca_client,
                 &self.config.alpaca_rate_limiter,
                 &alpaca::api::outgoing::news::News {
-                    symbols: vec![symbol.clone()],
+                    symbols: symbols.clone(),
                     start: Some(fetch_from),
                     end: Some(fetch_to),
                     page_token: next_page_token.clone(),
@@ -361,31 +486,28 @@ impl Handler for NewsHandler {
             )
             .await
             else {
-                error!("Failed to backfill news for {}.", symbol);
+                error!("Failed to backfill news for {:?}.", symbols);
                 return;
             };
 
-            message.news.into_iter().for_each(|news_item| {
-                news.push(News::from(news_item));
-            });
-
-            if message.next_page_token.is_none() {
-                break;
-            }
-
-            next_page_token = message.next_page_token;
-        }
-
-        if news.is_empty() {
-            info!("No news to backfill for {}.", symbol);
-            return;
-        }
+            for news_item in message.news {
+                let news_item = News::from(news_item);
+
+                for symbol in &news_item.symbols {
+                    last_time.insert(symbol.clone(), Some(news_item.time_created));
+                }
+
+                news.push(news_item);
+            }
 
+            if news.len() >= *BERT_MAX_INPUTS || message.next_page_token.is_none() {
                 let inputs = news
                     .iter()
                     .map(|news| format!("{}\n\n{}", news.headline, news.content))
                     .collect::<Vec<_>>();
 
-        let predictions = join_all(inputs.chunks(*BERT_MAX_INPUTS).map(|inputs| async move {
+                let predictions =
+                    join_all(inputs.chunks(*BERT_MAX_INPUTS).map(|inputs| async move {
                         let sequence_classifier = self.config.sequence_classifier.lock().await;
                         block_in_place(|| {
                             sequence_classifier
@@ -399,7 +521,7 @@ impl Handler for NewsHandler {
                     .into_iter()
                     .flatten();
 
-        let news = news
+                news = news
                     .into_iter()
                     .zip(predictions)
                     .map(|(news, prediction)| News {
@@ -408,9 +530,9 @@ impl Handler for NewsHandler {
                         ..news
                     })
                     .collect::<Vec<_>>();
+            }
 
-        let backfill = (news.last().unwrap().clone(), symbol.clone()).into();
-
+            if news.len() >= database::news::BATCH_FLUSH_SIZE || message.next_page_token.is_none() {
                 database::news::upsert_batch(
                     &self.config.clickhouse_client,
                     &self.config.clickhouse_concurrency_limiter,
@@ -418,15 +540,38 @@ impl Handler for NewsHandler {
                 )
                 .await
                 .unwrap();
 
-        database::backfills_news::upsert(
+                news = vec![];
+            }
+
+            if message.next_page_token.is_none() {
+                break;
+            }
+
+            next_page_token = message.next_page_token;
+        }
+
+        let (backfilled, skipped): (Vec<_>, Vec<_>) =
+            last_time.into_iter().partition_map(|(symbol, time)| {
+                if let Some(time) = time {
+                    Either::Left(Backfill { symbol, time })
+                } else {
+                    Either::Right(symbol)
+                }
+            });
+
+        database::backfills_news::upsert_batch(
             &self.config.clickhouse_client,
             &self.config.clickhouse_concurrency_limiter,
-            &backfill,
+            &backfilled,
         )
         .await
         .unwrap();
 
-        info!("Backfilled news for {}.", symbol);
+        info!("No news to backfill for {:?}.", skipped);
+        info!("Backfilled news for {:?}.", backfilled);
+    }
+
+    fn max_limit(&self) -> i64 {
+        alpaca::api::outgoing::news::MAX_LIMIT
     }
 
     fn log_string(&self) -> &'static str {
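Both backfill loops above now flush incrementally rather than holding an entire backfill in memory before the first upsert. A rough, simplified sketch of that shared pattern (hypothetical helper signature, not the crate's API): rows accumulate page by page and are written out whenever the buffer reaches the table's BATCH_FLUSH_SIZE or the final page arrives.

// Hypothetical sketch of the flush-while-paging loop used in the handlers above.
fn paged_backfill<Row>(
    mut fetch_page: impl FnMut(Option<String>) -> (Vec<Row>, Option<String>),
    mut flush: impl FnMut(Vec<Row>),
    batch_flush_size: usize,
) {
    let mut buffer: Vec<Row> = Vec::new();
    let mut next_page_token: Option<String> = None;

    loop {
        // Fetch the next page using the current pagination token.
        let (rows, token) = fetch_page(next_page_token.clone());
        buffer.extend(rows);

        // Flush early once the buffer is large enough, and always on the last page.
        if buffer.len() >= batch_flush_size || token.is_none() {
            flush(std::mem::take(&mut buffer));
        }

        if token.is_none() {
            break;
        }
        next_page_token = token;
    }
}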

View File

@@ -7,6 +7,8 @@ use serde::Serialize;
 use std::time::Duration;
 use time::OffsetDateTime;
 
+pub const MAX_LIMIT: i64 = 10_000;
+
 #[derive(Serialize)]
 #[serde(rename_all = "snake_case")]
 #[allow(dead_code)]
@@ -53,7 +55,7 @@ impl Default for UsEquity {
             timeframe: ONE_MINUTE,
             start: None,
             end: None,
-            limit: Some(10000),
+            limit: Some(MAX_LIMIT),
             adjustment: Some(Adjustment::All),
             asof: None,
             feed: Some(*ALPACA_SOURCE),
@@ -91,7 +93,7 @@ impl Default for Crypto {
             timeframe: ONE_MINUTE,
             start: None,
             end: None,
-            limit: Some(10000),
+            limit: Some(MAX_LIMIT),
             page_token: None,
             sort: Some(Sort::Asc),
         }

View File

@@ -2,6 +2,8 @@ use crate::{types::alpaca::shared::Sort, utils::ser};
 use serde::Serialize;
 use time::OffsetDateTime;
 
+pub const MAX_LIMIT: i64 = 50;
+
 #[derive(Serialize)]
 pub struct News {
     #[serde(serialize_with = "ser::remove_slash_from_pairs_join_symbols")]
@@ -30,7 +32,7 @@ impl Default for News {
             symbols: vec![],
             start: None,
             end: None,
-            limit: Some(50),
+            limit: Some(MAX_LIMIT),
             include_content: Some(true),
             exclude_contentless: Some(false),
             page_token: None,