fix(ah-staking): stall detection grace period, MinerPages fix, and simulation tools

- Add 3-session grace period to stall detection to allow RC XCM round-trip
  before triggering era recovery (StallDetectionCount storage added)
- Fix plan_new_era() to always increment CurrentEra regardless of
  ElectionProvider::start() result, preventing infinite retry loops
- Fix MinerPages from 2 to 32 to match Pages config (was causing
  incomplete OCW solutions and election failures)
- Bump AH spec_version to 1_020_007
- Add subxt example scripts for simulation and mainnet operations
- Remove obsolete fix_force_era.rs (replaced by sim_reset_election.rs)
This commit is contained in:
2026-02-19 17:16:43 +03:00
parent 21d1bc2375
commit cc156a1d61
17 changed files with 3254 additions and 130 deletions
@@ -847,6 +847,17 @@ pub mod pezpallet {
pub type ElectableStashes<T: Config> =
StorageValue<_, BoundedBTreeSet<T::AccountId, T::MaxValidatorSet>, ValueQuery>;
/// Counts sessions in which a stall condition was detected but recovery was deferred
/// to allow the relay chain time to respond.
///
/// After an election completes and the validator set is sent to the relay chain,
/// there is an XCM round-trip delay before the relay chain sends back the
/// `activation_timestamp`. This counter prevents the stall detection from
/// prematurely reverting the planned era.
///
/// Lifecycle, as used by the stall check in `Rotator`:
/// - while the value is below [`session_rotation::STALL_GRACE_SESSIONS`], each stall
///   detection increments it by one and defers recovery;
/// - once the value is `>=` that threshold, the next stall detection reverts the
///   planned era and plans a new one;
/// - the value is removed (`kill`) by the `EraElectionPlanner` cleanup routine.
///
/// Reads return `0` when nothing is stored (`ValueQuery` over `u32`).
///
/// NOTE(review): whether the count is also reset when a session passes *without* a
/// stall being detected is not visible from the stall check alone — confirm before
/// relying on the "consecutive" wording.
#[pezpallet::storage]
pub type StallDetectionCount<T: Config> = StorageValue<_, u32, ValueQuery>;
/// Tracks the current step of the era pruning process for each era being lazily pruned.
///
/// Keyed by `EraIndex`; the value is the `PruningStep` recorded for that era.
#[pezpallet::storage]
pub type EraPruningState<T: Config> = StorageMap<_, Twox64Concat, EraIndex, PruningStep>;
@@ -88,6 +88,15 @@ use pezsp_staking::{
currency_to_vote::CurrencyToVote, Exposure, Page, PagedExposureMetadata, SessionIndex,
};
/// Number of sessions the stall check waits before triggering stall recovery.
///
/// After an election completes and the validator set is sent to the relay chain,
/// the RC needs time for the XCM round-trip (receive validator set → process at
/// session boundary → send activation_timestamp back). This grace period prevents
/// premature era reverts.
///
/// The stall check in `Rotator` compares `StallDetectionCount` against this value:
/// while the count is below it, the count is incremented and recovery is deferred;
/// once the count is `>=` this value, the planned era is reverted and re-planned.
/// With the value 3 that means three deferrals, with recovery on the detection
/// that follows them.
///
/// 3 sessions is considered sufficient for both production (3 hours) and
/// fast-runtime simulation (12 minutes).
/// NOTE(review): those durations presume 1-hour and 4-minute sessions respectively —
/// confirm against the runtime configuration.
pub(crate) const STALL_GRACE_SESSIONS: u32 = 3;
/// A handler for all era-based storage items.
///
/// All of the following storage items must be controlled by this type:
@@ -677,22 +686,42 @@ impl<T: Config> Rotator<T> {
// Detect zombie pending era: election completed but produced 0 winners,
// RC never sent activation_timestamp. Break the deadlock by reverting
// the planned era and re-planning with a fresh election.
//
// IMPORTANT: After the election completes and the validator set is sent
// to the relay chain via XCM, there is a round-trip delay before the RC
// responds with the activation_timestamp. We use a grace period
// (STALL_GRACE_SESSIONS) to avoid prematurely reverting the era.
let election_idle = T::ElectionProvider::status().is_err();
let not_fetching = NextElectionPage::<T>::get().is_none();
if election_idle && not_fetching {
crate::log!(
warn,
"Detected stalled pending era {:?}: election finished but era was \
never activated. Reverting planned era and re-planning.",
current_planned_era
);
let active = Self::active_era();
CurrentEra::<T>::put(active);
EraElectionPlanner::<T>::cleanup();
Pezpallet::<T>::deposit_event(Event::Unexpected(
UnexpectedKind::StalledEraRecovery,
));
Self::plan_new_era();
let count = StallDetectionCount::<T>::get();
if count >= STALL_GRACE_SESSIONS {
crate::log!(
warn,
"Detected stalled pending era {:?}: election finished \
but era was never activated after {} sessions. \
Reverting planned era and re-planning.",
current_planned_era,
count
);
let active = Self::active_era();
CurrentEra::<T>::put(active);
EraElectionPlanner::<T>::cleanup();
Pezpallet::<T>::deposit_event(Event::Unexpected(
UnexpectedKind::StalledEraRecovery,
));
Self::plan_new_era();
} else {
StallDetectionCount::<T>::put(count + 1);
crate::log!(
info,
"Waiting for RC activation of pending era {:?} \
(grace {}/{}).",
current_planned_era,
count + 1,
STALL_GRACE_SESSIONS
);
}
} else {
crate::log!(
debug,
@@ -854,13 +883,16 @@ impl<T: Config> Rotator<T> {
/// Plans a new era by kicking off the election process.
///
/// The newly planned era is targeted to activate in the next session.
///
/// `CurrentEra` is always advanced by exactly one, whatever
/// [`EraElectionPlanner::plan_new_election`] returns. If the election provider is
/// already running (e.g., `Err(Ongoing)`), the era is still marked as "planning";
/// the ongoing election's results will be attributed to this planned era when
/// fetched by [`EraElectionPlanner::maybe_fetch_election_results`].
fn plan_new_era() {
    // An unset `CurrentEra` is treated as era 0, so the first planned era is 1.
    let current = CurrentEra::<T>::get().unwrap_or(0);
    log!(info, "Planning new era: {:?}, sending election start signal", current);
    // The start result is deliberately discarded. Returning it from a
    // `CurrentEra::try_mutate` closure would roll back the increment on `Err`,
    // leaving the era un-planned and the caller retrying on every session.
    let _ = EraElectionPlanner::<T>::plan_new_election();
    CurrentEra::<T>::put(current + 1);
}
/// Returns whether we are at the session where we should plan the new era.
@@ -914,7 +946,8 @@ impl<T: Config> EraElectionPlanner<T> {
VoterSnapshotStatus::<T>::kill();
NextElectionPage::<T>::kill();
ElectableStashes::<T>::kill();
Pezpallet::<T>::register_weight(T::DbWeight::get().writes(3));
StallDetectionCount::<T>::kill();
Pezpallet::<T>::register_weight(T::DbWeight::get().writes(4));
}
/// Fetches the number of pages configured by the election provider.