Add a retry layer to all providers (#224)

* Add a `ReceiptRetryLayer` for providers

* Fix the retry layer

* Rename the retry layer

* Remove outdated polling function

* Remoe unneeded dependencies
This commit is contained in:
Omar
2026-01-13 22:39:47 +03:00
committed by GitHub
parent d38e6d419d
commit 2d3602aaed
17 changed files with 492 additions and 528 deletions
+13 -92
View File
@@ -3,7 +3,6 @@
use std::{
fs::{File, create_dir_all, remove_dir_all},
io::Read,
ops::ControlFlow,
path::PathBuf,
pin::Pin,
process::{Command, Stdio},
@@ -35,12 +34,9 @@ use anyhow::Context as _;
use futures::{FutureExt, Stream, StreamExt};
use revive_common::EVMVersion;
use tokio::sync::OnceCell;
use tracing::{Instrument, error, instrument};
use tracing::{error, instrument};
use revive_dt_common::{
fs::clear_directory,
futures::{PollingWaitBehavior, poll},
};
use revive_dt_common::fs::clear_directory;
use revive_dt_config::*;
use revive_dt_format::traits::ResolverApi;
use revive_dt_node_interaction::EthereumNode;
@@ -90,12 +86,6 @@ impl GethNode {
const READY_MARKER: &str = "IPC endpoint opened";
const ERROR_MARKER: &str = "Fatal:";
const TRANSACTION_INDEXING_ERROR: &str = "transaction indexing is in progress";
const TRANSACTION_TRACING_ERROR: &str = "historical state not available in path scheme yet";
const RECEIPT_POLLING_DURATION: Duration = Duration::from_secs(5 * 60);
const TRACE_POLLING_DURATION: Duration = Duration::from_secs(60);
pub fn new(
context: impl AsRef<WorkingDirectoryConfiguration>
+ AsRef<WalletConfiguration>
@@ -341,62 +331,15 @@ impl EthereumNode for GethNode {
transaction: TransactionRequest,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + '_>> {
Box::pin(async move {
let provider = self
.provider()
self.provider()
.await
.context("Failed to create provider for transaction submission")?;
let pending_transaction = provider
.context("Failed to create provider for transaction submission")?
.send_transaction(transaction)
.await
.inspect_err(
|err| error!(%err, "Encountered an error when submitting the transaction"),
)
.context("Failed to submit transaction to geth node")?;
let transaction_hash = *pending_transaction.tx_hash();
// The following is a fix for the "transaction indexing is in progress" error that we used
// to get. You can find more information on this in the following GH issue in geth
// https://github.com/ethereum/go-ethereum/issues/28877. To summarize what's going on,
// before we can get the receipt of the transaction it needs to have been indexed by the
// node's indexer. Just because the transaction has been confirmed it doesn't mean that it
// has been indexed. When we call alloy's `get_receipt` it checks if the transaction was
// confirmed. If it has been, then it will call `eth_getTransactionReceipt` method which
// _might_ return the above error if the tx has not yet been indexed yet. So, we need to
// implement a retry mechanism for the receipt to keep retrying to get it until it
// eventually works, but we only do that if the error we get back is the "transaction
// indexing is in progress" error or if the receipt is None.
//
// Getting the transaction indexed and taking a receipt can take a long time especially when
// a lot of transactions are being submitted to the node. Thus, while initially we only
// allowed for 60 seconds of waiting with a 1 second delay in polling, we need to allow for
// a larger wait time. Therefore, in here we allow for 5 minutes of waiting with exponential
// backoff each time we attempt to get the receipt and find that it's not available.
poll(
Self::RECEIPT_POLLING_DURATION,
PollingWaitBehavior::Constant(Duration::from_millis(200)),
move || {
let provider = provider.clone();
async move {
match provider.get_transaction_receipt(transaction_hash).await {
Ok(Some(receipt)) => Ok(ControlFlow::Break(receipt)),
Ok(None) => Ok(ControlFlow::Continue(())),
Err(error) => {
let error_string = error.to_string();
match error_string.contains(Self::TRANSACTION_INDEXING_ERROR) {
true => Ok(ControlFlow::Continue(())),
false => Err(error.into()),
}
}
}
}
},
)
.instrument(tracing::info_span!(
"Awaiting transaction receipt",
?transaction_hash
))
.await
.context("Encountered an error when submitting a transaction")?
.get_receipt()
.await
.context("Failed to get the receipt for the transaction")
})
}
@@ -407,34 +350,12 @@ impl EthereumNode for GethNode {
trace_options: GethDebugTracingOptions,
) -> Pin<Box<dyn Future<Output = anyhow::Result<GethTrace>> + '_>> {
Box::pin(async move {
let provider = self
.provider()
self.provider()
.await
.context("Failed to create provider for tracing")?;
poll(
Self::TRACE_POLLING_DURATION,
PollingWaitBehavior::Constant(Duration::from_millis(200)),
move || {
let provider = provider.clone();
let trace_options = trace_options.clone();
async move {
match provider
.debug_trace_transaction(tx_hash, trace_options)
.await
{
Ok(trace) => Ok(ControlFlow::Break(trace)),
Err(error) => {
let error_string = error.to_string();
match error_string.contains(Self::TRANSACTION_TRACING_ERROR) {
true => Ok(ControlFlow::Continue(())),
false => Err(error.into()),
}
}
}
}
},
)
.await
.context("Failed to create provider for tracing")?
.debug_trace_transaction(tx_hash, trace_options)
.await
.context("Failed to get the transaction trace")
})
}
@@ -12,7 +12,6 @@ use std::{
collections::{BTreeMap, HashSet},
fs::{File, create_dir_all},
io::Read,
ops::ControlFlow,
path::PathBuf,
pin::Pin,
process::{Command, Stdio},
@@ -48,12 +47,9 @@ use revive_common::EVMVersion;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_with::serde_as;
use tokio::sync::OnceCell;
use tracing::{Instrument, info, instrument};
use tracing::{info, instrument};
use revive_dt_common::{
fs::clear_directory,
futures::{PollingWaitBehavior, poll},
};
use revive_dt_common::fs::clear_directory;
use revive_dt_config::*;
use revive_dt_format::traits::ResolverApi;
use revive_dt_node_interaction::EthereumNode;
@@ -116,12 +112,6 @@ impl LighthouseGethNode {
const CONFIG_FILE_NAME: &str = "config.yaml";
const TRANSACTION_INDEXING_ERROR: &str = "transaction indexing is in progress";
const TRANSACTION_TRACING_ERROR: &str = "historical state not available in path scheme yet";
const RECEIPT_POLLING_DURATION: Duration = Duration::from_secs(5 * 60);
const TRACE_POLLING_DURATION: Duration = Duration::from_secs(60);
const VALIDATOR_MNEMONIC: &str = "giant issue aisle success illegal bike spike question tent bar rely arctic volcano long crawl hungry vocal artwork sniff fantasy very lucky have athlete";
pub fn new(
@@ -481,73 +471,6 @@ impl LighthouseGethNode {
Ok(())
}
fn internal_execute_transaction<'a>(
transaction: TransactionRequest,
provider: FillProvider<
impl TxFiller<Ethereum> + 'a,
impl Provider<Ethereum> + Clone + 'a,
Ethereum,
>,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + 'a>> {
Box::pin(async move {
let pending_transaction = provider
.send_transaction(transaction)
.await
.inspect_err(|err| {
tracing::error!(
%err,
"Encountered an error when submitting the transaction"
)
})
.context("Failed to submit transaction to geth node")?;
let transaction_hash = *pending_transaction.tx_hash();
// The following is a fix for the "transaction indexing is in progress" error that we
// used to get. You can find more information on this in the following GH issue in geth
// https://github.com/ethereum/go-ethereum/issues/28877. To summarize what's going on,
// before we can get the receipt of the transaction it needs to have been indexed by the
// node's indexer. Just because the transaction has been confirmed it doesn't mean that
// it has been indexed. When we call alloy's `get_receipt` it checks if the transaction
// was confirmed. If it has been, then it will call `eth_getTransactionReceipt` method
// which _might_ return the above error if the tx has not yet been indexed yet. So, we
// need to implement a retry mechanism for the receipt to keep retrying to get it until
// it eventually works, but we only do that if the error we get back is the "transaction
// indexing is in progress" error or if the receipt is None.
//
// Getting the transaction indexed and taking a receipt can take a long time especially
// when a lot of transactions are being submitted to the node. Thus, while initially we
// only allowed for 60 seconds of waiting with a 1 second delay in polling, we need to
// allow for a larger wait time. Therefore, in here we allow for 5 minutes of waiting
// with exponential backoff each time we attempt to get the receipt and find that it's
// not available.
poll(
Self::RECEIPT_POLLING_DURATION,
PollingWaitBehavior::Constant(Duration::from_millis(500)),
move || {
let provider = provider.clone();
async move {
match provider.get_transaction_receipt(transaction_hash).await {
Ok(Some(receipt)) => Ok(ControlFlow::Break(receipt)),
Ok(None) => Ok(ControlFlow::Continue(())),
Err(error) => {
let error_string = error.to_string();
match error_string.contains(Self::TRANSACTION_INDEXING_ERROR) {
true => Ok(ControlFlow::Continue(())),
false => Err(error.into()),
}
}
}
}
},
)
.instrument(tracing::info_span!(
"Awaiting transaction receipt",
?transaction_hash
))
.await
})
}
pub fn node_genesis(mut genesis: Genesis, wallet: &EthereumWallet) -> Genesis {
for signer_address in NetworkWallet::<Ethereum>::signer_addresses(&wallet) {
genesis
@@ -626,11 +549,15 @@ impl EthereumNode for LighthouseGethNode {
transaction: TransactionRequest,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + '_>> {
Box::pin(async move {
let provider = self
.http_provider()
self.provider()
.await
.context("Failed to create provider for transaction execution")?;
Self::internal_execute_transaction(transaction, provider).await
.context("Failed to create provider for transaction submission")?
.send_transaction(transaction)
.await
.context("Encountered an error when submitting a transaction")?
.get_receipt()
.await
.context("Failed to get the receipt for the transaction")
})
}
@@ -641,35 +568,12 @@ impl EthereumNode for LighthouseGethNode {
trace_options: GethDebugTracingOptions,
) -> Pin<Box<dyn Future<Output = anyhow::Result<GethTrace>> + '_>> {
Box::pin(async move {
let provider = Arc::new(
self.http_provider()
.await
.context("Failed to create provider for tracing")?,
);
poll(
Self::TRACE_POLLING_DURATION,
PollingWaitBehavior::Constant(Duration::from_millis(200)),
move || {
let provider = provider.clone();
let trace_options = trace_options.clone();
async move {
match provider
.debug_trace_transaction(tx_hash, trace_options)
.await
{
Ok(trace) => Ok(ControlFlow::Break(trace)),
Err(error) => {
let error_string = error.to_string();
match error_string.contains(Self::TRANSACTION_TRACING_ERROR) {
true => Ok(ControlFlow::Continue(())),
false => Err(error.into()),
}
}
}
}
},
)
.await
self.provider()
.await
.context("Failed to create provider for tracing")?
.debug_trace_transaction(tx_hash, trace_options)
.await
.context("Failed to get the transaction trace")
})
}
@@ -50,10 +50,7 @@ use crate::{
Node,
constants::INITIAL_BALANCE,
helpers::{Process, ProcessReadinessWaitBehavior},
provider_utils::{
ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider,
execute_transaction,
},
provider_utils::{ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider},
};
static NODE_COUNT: AtomicU32 = AtomicU32::new(0);
@@ -464,11 +461,15 @@ impl EthereumNode for PolkadotOmnichainNode {
transaction: TransactionRequest,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + '_>> {
Box::pin(async move {
let provider = self
.provider()
self.provider()
.await
.context("Failed to create the provider")?;
execute_transaction(provider, transaction).await
.context("Failed to create provider for transaction submission")?
.send_transaction(transaction)
.await
.context("Encountered an error when submitting a transaction")?
.get_receipt()
.await
.context("Failed to get the receipt for the transaction")
})
}
@@ -49,10 +49,7 @@ use crate::{
Node,
constants::INITIAL_BALANCE,
helpers::{Process, ProcessReadinessWaitBehavior},
provider_utils::{
ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider,
execute_transaction,
},
provider_utils::{ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider},
};
static NODE_COUNT: AtomicU32 = AtomicU32::new(0);
@@ -434,11 +431,15 @@ impl EthereumNode for SubstrateNode {
transaction: TransactionRequest,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + '_>> {
Box::pin(async move {
let provider = self
.provider()
self.provider()
.await
.context("Failed to create the provider")?;
execute_transaction(provider, transaction).await
.context("Failed to create provider for transaction submission")?
.send_transaction(transaction)
.await
.context("Encountered an error when submitting a transaction")?
.get_receipt()
.await
.context("Failed to get the receipt for the transaction")
})
}
@@ -76,10 +76,7 @@ use crate::{
Node,
constants::INITIAL_BALANCE,
helpers::{Process, ProcessReadinessWaitBehavior},
provider_utils::{
ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider,
execute_transaction,
},
provider_utils::{ConcreteProvider, FallbackGasFiller, construct_concurrency_limited_provider},
};
static NODE_COUNT: AtomicU32 = AtomicU32::new(0);
@@ -433,14 +430,18 @@ impl EthereumNode for ZombienetNode {
fn execute_transaction(
&self,
transaction: alloy::rpc::types::TransactionRequest,
transaction: TransactionRequest,
) -> Pin<Box<dyn Future<Output = anyhow::Result<TransactionReceipt>> + '_>> {
Box::pin(async move {
let provider = self
.provider()
self.provider()
.await
.context("Failed to create the provider")?;
execute_transaction(provider, transaction).await
.context("Failed to create provider for transaction submission")?
.send_transaction(transaction)
.await
.context("Encountered an error when submitting a transaction")?
.get_receipt()
.await
.context("Failed to get the receipt for the transaction")
})
}
@@ -1,5 +1,3 @@
use std::{borrow::Cow, fmt::Display};
use alloy::{
eips::BlockNumberOrTag,
network::{Network, TransactionBuilder},
@@ -111,28 +109,23 @@ where
},
state_overrides: Default::default(),
block_overrides: Default::default(),
tx_index: Default::default(),
},
)
.await?
.try_into_call_frame()
.map_err(|err| {
RpcError::LocalUsageError(
FallbackGasFillerError::new(format!(
"Expected a callframe trace, but got: {err:?}"
))
.boxed(),
RpcError::local_usage_str(
format!("Expected a callframe trace, but got: {err:?}").as_str(),
)
})?;
let gas_used = u64::try_from(trace.gas_used).map_err(|_| {
RpcError::LocalUsageError(
FallbackGasFillerError::new(
"Transaction trace returned a value of gas used that exceeds u64",
)
.boxed(),
RpcError::local_usage_str(
"Transaction trace returned a value of gas used that exceeds u64",
)
})?;
let gas_limit = gas_used.saturating_mul(120) / 100;
let gas_limit = gas_used.saturating_mul(2);
if let Some(gas_price) = tx.gas_price() {
return Ok(GasFillable::Legacy {
@@ -174,24 +167,3 @@ impl Default for FallbackGasFiller {
Self::new()
}
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
struct FallbackGasFillerError(Cow<'static, str>);
impl FallbackGasFillerError {
pub fn new(string: impl Into<Cow<'static, str>>) -> Self {
Self(string.into())
}
pub fn boxed(self) -> Box<Self> {
Box::new(self)
}
}
impl Display for FallbackGasFillerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.0, f)
}
}
impl std::error::Error for FallbackGasFillerError {}
+2
View File
@@ -1,7 +1,9 @@
mod concurrency_limiter;
mod fallback_gas_filler;
mod provider;
mod receipt_retry_layer;
pub use concurrency_limiter::*;
pub use fallback_gas_filler::*;
pub use provider::*;
pub use receipt_retry_layer::*;
+5 -73
View File
@@ -1,18 +1,16 @@
use std::{ops::ControlFlow, sync::LazyLock, time::Duration};
use std::sync::LazyLock;
use alloy::{
network::{Ethereum, Network, NetworkWallet, TransactionBuilder4844},
network::{Network, NetworkWallet, TransactionBuilder4844},
providers::{
Identity, PendingTransactionBuilder, Provider, ProviderBuilder, RootProvider,
Identity, ProviderBuilder, RootProvider,
fillers::{ChainIdFiller, FillProvider, JoinFill, NonceFiller, TxFiller, WalletFiller},
},
rpc::client::ClientBuilder,
};
use anyhow::{Context, Result};
use revive_dt_common::futures::{PollingWaitBehavior, poll};
use tracing::{Instrument, debug, info, info_span};
use crate::provider_utils::{ConcurrencyLimiterLayer, FallbackGasFiller};
use crate::provider_utils::{ConcurrencyLimiterLayer, FallbackGasFiller, RetryLayer};
pub type ConcreteProvider<N, W> = FillProvider<
JoinFill<
@@ -48,6 +46,7 @@ where
let client = ClientBuilder::default()
.layer(GLOBAL_CONCURRENCY_LIMITER_LAYER.clone())
.layer(RetryLayer::default())
.connect(rpc_url)
.await
.context("Failed to construct the RPC client")?;
@@ -63,70 +62,3 @@ where
Ok(provider)
}
pub async fn execute_transaction<N, W>(
provider: ConcreteProvider<N, W>,
transaction: N::TransactionRequest,
) -> Result<N::ReceiptResponse>
where
N: Network<
TransactionRequest: TransactionBuilder4844,
TxEnvelope = <Ethereum as Network>::TxEnvelope,
>,
W: NetworkWallet<N>,
Identity: TxFiller<N>,
FallbackGasFiller: TxFiller<N>,
ChainIdFiller: TxFiller<N>,
NonceFiller: TxFiller<N>,
WalletFiller<W>: TxFiller<N>,
{
let sendable_transaction = provider
.fill(transaction)
.await
.context("Failed to fill transaction")?;
let transaction_envelope = sendable_transaction
.try_into_envelope()
.context("Failed to convert transaction into an envelope")?;
let tx_hash = *transaction_envelope.tx_hash();
let mut pending_transaction = match provider.send_tx_envelope(transaction_envelope).await {
Ok(pending_transaction) => pending_transaction,
Err(error) => {
let error_string = error.to_string();
if error_string.contains("Transaction Already Imported") {
PendingTransactionBuilder::<N>::new(provider.root().clone(), tx_hash)
} else {
return Err(error).context(format!("Failed to submit transaction {tx_hash}"));
}
}
};
debug!(%tx_hash, "Submitted Transaction");
pending_transaction.set_timeout(Some(Duration::from_secs(120)));
let tx_hash = pending_transaction.watch().await.context(format!(
"Transaction inclusion watching timeout for {tx_hash}"
))?;
poll(
Duration::from_secs(60),
PollingWaitBehavior::Constant(Duration::from_secs(3)),
|| {
let provider = provider.clone();
async move {
match provider.get_transaction_receipt(tx_hash).await {
Ok(Some(receipt)) => {
info!("Found the transaction receipt");
Ok(ControlFlow::Break(receipt))
}
_ => Ok(ControlFlow::Continue(())),
}
}
},
)
.instrument(info_span!("Polling for receipt", %tx_hash))
.await
.context(format!("Polling for receipt failed for {tx_hash}"))
}
@@ -0,0 +1,158 @@
use std::time::Duration;
use alloy::{
network::{AnyNetwork, Network},
rpc::json_rpc::{RequestPacket, ResponsePacket},
transports::{TransportError, TransportErrorKind, TransportFut},
};
use tokio::time::{interval, timeout};
use tower::{Layer, Service};
/// A layer that allows for automatic retries for getting the receipt.
///
/// There are certain cases where getting the receipt of a committed transaction might fail. In Geth
/// this can happen if the transaction has been committed to the ledger but has not been indexed, in
/// the substrate and revive stack it can also happen for other reasons.
///
/// Therefore, just because the first attempt to get the receipt (after transaction confirmation)
/// has failed it doesn't mean that it will continue to fail. This layer can be added to any alloy
/// provider to allow the provider to retry getting the receipt for some period of time before it
/// considers that a timeout. It attempts to poll for the receipt for the `polling_duration` with an
/// interval of `polling_interval` between each poll. If by the end of the `polling_duration` it was
/// not able to get the receipt successfully then this is considered to be a timeout.
///
/// Additionally, this layer allows for retries for other rpc methods such as all tracing methods.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RetryLayer {
/// The amount of time to keep polling for the receipt before considering it a timeout.
polling_duration: Duration,
/// The interval of time to wait between each poll for the receipt.
polling_interval: Duration,
}
impl RetryLayer {
pub fn new(polling_duration: Duration, polling_interval: Duration) -> Self {
Self {
polling_duration,
polling_interval,
}
}
pub fn with_polling_duration(mut self, polling_duration: Duration) -> Self {
self.polling_duration = polling_duration;
self
}
pub fn with_polling_interval(mut self, polling_interval: Duration) -> Self {
self.polling_interval = polling_interval;
self
}
}
impl Default for RetryLayer {
fn default() -> Self {
Self {
polling_duration: Duration::from_secs(90),
polling_interval: Duration::from_millis(500),
}
}
}
impl<S> Layer<S> for RetryLayer {
type Service = RetryService<S>;
fn layer(&self, inner: S) -> Self::Service {
RetryService {
service: inner,
polling_duration: self.polling_duration,
polling_interval: self.polling_interval,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RetryService<S> {
/// The internal service.
service: S,
/// The amount of time to keep polling for the receipt before considering it a timeout.
polling_duration: Duration,
/// The interval of time to wait between each poll for the receipt.
polling_interval: Duration,
}
impl<S> Service<RequestPacket> for RetryService<S>
where
S: Service<RequestPacket, Future = TransportFut<'static>, Error = TransportError>
+ Send
+ 'static
+ Clone,
{
type Response = ResponsePacket;
type Error = TransportError;
type Future = TransportFut<'static>;
fn poll_ready(
&mut self,
cx: &mut std::task::Context<'_>,
) -> std::task::Poll<Result<(), Self::Error>> {
self.service.poll_ready(cx)
}
#[allow(clippy::nonminimal_bool)]
fn call(&mut self, req: RequestPacket) -> Self::Future {
type ReceiptOutput = <AnyNetwork as Network>::ReceiptResponse;
let mut service = self.service.clone();
let polling_interval = self.polling_interval;
let polling_duration = self.polling_duration;
Box::pin(async move {
let request = req.as_single().ok_or_else(|| {
TransportErrorKind::custom_str("Retry layer doesn't support batch requests")
})?;
let method = request.method();
let requires_retries = method == "eth_getTransactionReceipt"
|| (method.contains("debug") && method.contains("trace"));
if !requires_retries {
return service.call(req).await;
}
timeout(polling_duration, async {
let mut interval = interval(polling_interval);
loop {
interval.tick().await;
let Ok(resp) = service.call(req.clone()).await else {
continue;
};
let response = resp.as_single().expect("Can't fail");
if response.is_error() {
continue;
}
if method == "eth_getTransactionReceipt"
&& response
.payload()
.clone()
.deserialize_success::<ReceiptOutput>()
.ok()
.and_then(|resp| resp.try_into_success().ok())
.is_some()
|| method != "eth_getTransactionReceipt"
{
return resp;
} else {
continue;
}
}
})
.await
.map_err(|_| TransportErrorKind::custom_str("Timeout when retrying request"))
})
}
}