fix(ah-staking): stall detection grace period, MinerPages fix, and simulation tools
- Add a 3-session grace period to stall detection to allow the RC XCM round-trip to complete before triggering era recovery (`StallDetectionCount` storage added).
- Fix `plan_new_era()` to always increment `CurrentEra` regardless of the `ElectionProvider::start()` result, preventing infinite retry loops.
- Change `MinerPages` from 2 to 32 to match the `Pages` config (the mismatch was causing incomplete OCW solutions and election failures).
- Bump AH `spec_version` to 1_020_007.
- Add subxt example scripts for simulation and mainnet operations.
- Remove obsolete `fix_force_era.rs` (replaced by `sim_reset_election.rs`).
This commit is contained in:
@@ -847,6 +847,17 @@ pub mod pezpallet {
|
||||
pub type ElectableStashes<T: Config> =
|
||||
StorageValue<_, BoundedBTreeSet<T::AccountId, T::MaxValidatorSet>, ValueQuery>;
|
||||
|
||||
/// Counts consecutive sessions where a stall condition was detected but recovery
|
||||
/// was deferred to allow the relay chain time to respond.
|
||||
///
|
||||
/// After an election completes and the validator set is sent to the relay chain,
|
||||
/// there is an XCM round-trip delay before the relay chain sends back the
|
||||
/// `activation_timestamp`. This counter prevents the stall detection from
|
||||
/// prematurely reverting the planned era. Stall recovery only triggers after the
|
||||
/// counter reaches [`session_rotation::STALL_GRACE_SESSIONS`].
|
||||
#[pezpallet::storage]
|
||||
pub type StallDetectionCount<T: Config> = StorageValue<_, u32, ValueQuery>;
|
||||
|
||||
/// Tracks the current step of era pruning process for each era being lazily pruned.
|
||||
#[pezpallet::storage]
|
||||
pub type EraPruningState<T: Config> = StorageMap<_, Twox64Concat, EraIndex, PruningStep>;
|
||||
|
||||
@@ -88,6 +88,15 @@ use pezsp_staking::{
|
||||
currency_to_vote::CurrencyToVote, Exposure, Page, PagedExposureMetadata, SessionIndex,
|
||||
};
|
||||
|
||||
/// Number of consecutive sessions to wait before triggering stall recovery.
|
||||
///
|
||||
/// After an election completes and the validator set is sent to the relay chain,
|
||||
/// the RC needs time for the XCM round-trip (receive validator set → process at
|
||||
/// session boundary → send activation_timestamp back). This grace period prevents
|
||||
/// premature era reverts. 3 sessions is sufficient for both production (3 hours)
|
||||
/// and fast-runtime simulation (12 minutes).
|
||||
pub(crate) const STALL_GRACE_SESSIONS: u32 = 3;
|
||||
|
||||
/// A handler for all era-based storage items.
|
||||
///
|
||||
/// All of the following storage items must be controlled by this type:
|
||||
@@ -677,22 +686,42 @@ impl<T: Config> Rotator<T> {
|
||||
// Detect zombie pending era: election completed but produced 0 winners,
|
||||
// RC never sent activation_timestamp. Break the deadlock by reverting
|
||||
// the planned era and re-planning with a fresh election.
|
||||
//
|
||||
// IMPORTANT: After the election completes and the validator set is sent
|
||||
// to the relay chain via XCM, there is a round-trip delay before the RC
|
||||
// responds with the activation_timestamp. We use a grace period
|
||||
// (STALL_GRACE_SESSIONS) to avoid prematurely reverting the era.
|
||||
let election_idle = T::ElectionProvider::status().is_err();
|
||||
let not_fetching = NextElectionPage::<T>::get().is_none();
|
||||
if election_idle && not_fetching {
|
||||
crate::log!(
|
||||
warn,
|
||||
"Detected stalled pending era {:?}: election finished but era was \
|
||||
never activated. Reverting planned era and re-planning.",
|
||||
current_planned_era
|
||||
);
|
||||
let active = Self::active_era();
|
||||
CurrentEra::<T>::put(active);
|
||||
EraElectionPlanner::<T>::cleanup();
|
||||
Pezpallet::<T>::deposit_event(Event::Unexpected(
|
||||
UnexpectedKind::StalledEraRecovery,
|
||||
));
|
||||
Self::plan_new_era();
|
||||
let count = StallDetectionCount::<T>::get();
|
||||
if count >= STALL_GRACE_SESSIONS {
|
||||
crate::log!(
|
||||
warn,
|
||||
"Detected stalled pending era {:?}: election finished \
|
||||
but era was never activated after {} sessions. \
|
||||
Reverting planned era and re-planning.",
|
||||
current_planned_era,
|
||||
count
|
||||
);
|
||||
let active = Self::active_era();
|
||||
CurrentEra::<T>::put(active);
|
||||
EraElectionPlanner::<T>::cleanup();
|
||||
Pezpallet::<T>::deposit_event(Event::Unexpected(
|
||||
UnexpectedKind::StalledEraRecovery,
|
||||
));
|
||||
Self::plan_new_era();
|
||||
} else {
|
||||
StallDetectionCount::<T>::put(count + 1);
|
||||
crate::log!(
|
||||
info,
|
||||
"Waiting for RC activation of pending era {:?} \
|
||||
(grace {}/{}).",
|
||||
current_planned_era,
|
||||
count + 1,
|
||||
STALL_GRACE_SESSIONS
|
||||
);
|
||||
}
|
||||
} else {
|
||||
crate::log!(
|
||||
debug,
|
||||
@@ -854,13 +883,16 @@ impl<T: Config> Rotator<T> {
|
||||
/// Plans a new era by kicking off the election process.
|
||||
///
|
||||
/// The newly planned era is targeted to activate in the next session.
|
||||
///
|
||||
/// If the election provider is already running (e.g., `Err(Ongoing)`), we still
|
||||
/// increment `CurrentEra` to mark the era as "planning". The ongoing election's
|
||||
/// results will be attributed to this planned era when fetched by
|
||||
/// [`EraElectionPlanner::maybe_fetch_election_results`].
|
||||
fn plan_new_era() {
|
||||
let _ = CurrentEra::<T>::try_mutate(|x| {
|
||||
log!(info, "Planning new era: {:?}, sending election start signal", x.unwrap_or(0));
|
||||
let could_start_election = EraElectionPlanner::<T>::plan_new_election();
|
||||
*x = Some(x.unwrap_or(0) + 1);
|
||||
could_start_election
|
||||
});
|
||||
let current = CurrentEra::<T>::get().unwrap_or(0);
|
||||
log!(info, "Planning new era: {:?}, sending election start signal", current);
|
||||
let _ = EraElectionPlanner::<T>::plan_new_election();
|
||||
CurrentEra::<T>::put(current + 1);
|
||||
}
|
||||
|
||||
/// Returns whether we are at the session where we should plan the new era.
|
||||
@@ -914,7 +946,8 @@ impl<T: Config> EraElectionPlanner<T> {
|
||||
VoterSnapshotStatus::<T>::kill();
|
||||
NextElectionPage::<T>::kill();
|
||||
ElectableStashes::<T>::kill();
|
||||
Pezpallet::<T>::register_weight(T::DbWeight::get().writes(3));
|
||||
StallDetectionCount::<T>::kill();
|
||||
Pezpallet::<T>::register_weight(T::DbWeight::get().writes(4));
|
||||
}
|
||||
|
||||
/// Fetches the number of pages configured by the election provider.
|
||||
|
||||
Reference in New Issue
Block a user