Fix nothing scheduled on session boundary (#1403)

* Fix scheduled state at session boundaries. * Cleanup + better docs. * More cleanup and fixes. * Remove 12s hack. * Add dep. * Make clippy happy --------- Co-authored-by: eskimor <eskimor@no-such-url.com>
2026-05-31 15:41:02 +00:00 · 2023-09-06 18:19:21 +02:00
parent eeb368ed9c
commit 1e2a2f0c69
19 changed files with 489 additions and 643 deletions
@@ -39,11 +39,11 @@
 use crate::{configuration, initializer::SessionChangeNotification, paras};
 use frame_support::pallet_prelude::*;
 use frame_system::pallet_prelude::BlockNumberFor;
+pub use polkadot_core_primitives::v2::BlockNumber;
 use primitives::{
-	v5::ParasEntry, CoreIndex, CoreOccupied, GroupIndex, GroupRotationInfo, Id as ParaId,
-	ScheduledCore, ValidatorIndex,
+	CoreIndex, GroupIndex, GroupRotationInfo, Id as ParaId, ScheduledCore, ValidatorIndex,
 };
-use sp_runtime::traits::{One, Saturating};
+use sp_runtime::traits::One;
 use sp_std::{
 	collections::{btree_map::BTreeMap, vec_deque::VecDeque},
 	prelude::*,
@@ -51,7 +51,7 @@ use sp_std::{

 pub mod common;

-use common::{AssignmentProvider, AssignmentProviderConfig, CoreAssignment, FreedReason};
+use common::{Assignment, AssignmentProvider, AssignmentProviderConfig};

 pub use pallet::*;

@@ -101,6 +101,36 @@ pub mod pallet {
 	pub(crate) type AvailabilityCores<T: Config> =
 		StorageValue<_, Vec<CoreOccupied<BlockNumberFor<T>>>, ValueQuery>;

+	/// Representation of a core in `AvailabilityCores`.
+	///
+	/// This is not to be confused with `CoreState` which is an enriched variant of this and exposed
+	/// to the node side. It also provides information about scheduled/upcoming assignments for
+	/// example and is computed on the fly in the `availability_cores` runtime call.
+	#[derive(Clone, Encode, Decode, TypeInfo, RuntimeDebug)]
+	#[cfg_attr(feature = "std", derive(PartialEq))]
+	pub enum CoreOccupied<N> {
+		/// No candidate is waiting availability on this core right now (the core is not occupied).
+		Free,
+		/// A para is currently waiting for availability/inclusion on this core.
+		Paras(ParasEntry<N>),
+	}
+
+	impl<N> CoreOccupied<N> {
+		/// Is core free?
+		pub fn is_free(&self) -> bool {
+			matches!(self, Self::Free)
+		}
+	}
+
+	/// Reasons a core might be freed.
+	#[derive(Clone, Copy)]
+	pub enum FreedReason {
+		/// The core's work concluded and the parablock assigned to it is considered available.
+		Concluded,
+		/// The core's work timed out.
+		TimedOut,
+	}
+
 	/// The block number where the session start occurred. Used to track how many group rotations
 	/// have occurred.
 	///
@@ -124,6 +154,68 @@ pub mod pallet {
 		BTreeMap<CoreIndex, VecDeque<Option<ParasEntry<BlockNumberFor<T>>>>>,
 		ValueQuery,
 	>;
+
+	/// Assignments as tracked in the claim queue.
+	#[derive(Clone, Encode, Decode, TypeInfo, PartialEq, RuntimeDebug)]
+	pub struct ParasEntry<N = BlockNumber> {
+		/// The underlying `Assignment`
+		pub assignment: Assignment,
+		/// The number of times the entry has timed out in availability already.
+		pub availability_timeouts: u32,
+		/// The block height until this entry needs to be backed.
+		///
+		/// If missed the entry will be removed from the claim queue without ever having occupied
+		/// the core.
+		pub ttl: N,
+	}
+
+	impl<N> ParasEntry<N> {
+		/// Return `Id` from the underlying `Assignment`.
+		pub fn para_id(&self) -> ParaId {
+			self.assignment.para_id
+		}
+
+		/// Create a new `ParasEntry`.
+		pub fn new(assignment: Assignment, now: N) -> Self {
+			ParasEntry { assignment, availability_timeouts: 0, ttl: now }
+		}
+	}
+
+	/// How a core is mapped to a backing group and a `ParaId`
+	#[derive(Clone, Encode, Decode, PartialEq, TypeInfo)]
+	#[cfg_attr(feature = "std", derive(Debug))]
+	pub struct CoreAssignment<BlockNumber> {
+		/// The core that is assigned.
+		pub core: CoreIndex,
+		/// The para id and accompanying information needed to collate and back a parablock.
+		pub paras_entry: ParasEntry<BlockNumber>,
+	}
+
+	impl<BlockNumber> CoreAssignment<BlockNumber> {
+		/// Returns the [`ParaId`] of the assignment.
+		pub fn para_id(&self) -> ParaId {
+			self.paras_entry.para_id()
+		}
+
+		/// Returns the inner [`ParasEntry`] of the assignment.
+		pub fn to_paras_entry(self) -> ParasEntry<BlockNumber> {
+			self.paras_entry
+		}
+	}
+
+	/// Availability timeout status of a core.
+	pub(crate) struct AvailabilityTimeoutStatus<BlockNumber> {
+		/// Is the core already timed out?
+		///
+		/// If this is true the core will be freed at this block.
+		pub timed_out: bool,
+
+		/// When does this core timeout.
+		///
+		/// The block number the core times out. If `timed_out` is true, this will correspond to
+		/// now (current block number).
+		pub live_until: BlockNumber,
+	}
 }

 type PositionInClaimqueue = u32;
@@ -368,50 +460,47 @@ impl<T: Config> Pallet<T> {
 		Some(GroupIndex(group_idx as u32))
 	}

-	/// Returns an optional predicate that should be used for timing out occupied cores.
+	/// Returns a predicate that should be used for timing out occupied cores.
 	///
-	/// If `None`, no timing-out should be done. The predicate accepts the index of the core, and
-	/// the block number since which it has been occupied, and the respective parachain timeouts,
-	/// i.e. only within `config.paras_availability_period` of the last rotation would this return
-	/// `Some`, unless there are no rotations.
-	///
-	/// The timeout used to depend, but does not depend any more on group rotations. First of all
-	/// it only matters if a para got another chance (a retry). If there is a retry and it happens
-	/// still within the same group rotation a censoring backing group would need to censor again
-	/// and lose out again on backing rewards. This is bad for the censoring backing group, it does
-	/// not matter for the parachain as long as it is retried often enough (so it eventually gets a
-	/// try on another backing group) - the effect is similar to having a prolonged timeout. It
-	/// should also be noted that for both malicious and offline backing groups it is actually more
-	/// realistic that the candidate will not be backed to begin with, instead of getting backed
-	/// and then not made available.
+	/// This only ever times out cores that have been occupied across a group rotation boundary.
 	pub(crate) fn availability_timeout_predicate(
-	) -> Option<impl Fn(CoreIndex, BlockNumberFor<T>) -> bool> {
-		let now = <frame_system::Pallet<T>>::block_number();
+	) -> impl Fn(BlockNumberFor<T>) -> AvailabilityTimeoutStatus<BlockNumberFor<T>> {
 		let config = <configuration::Pallet<T>>::config();
-		let session_start = <SessionStartBlock<T>>::get();
+		let now = <frame_system::Pallet<T>>::block_number();
+		let rotation_info = Self::group_rotation_info(now);

-		let blocks_since_session_start = now.saturating_sub(session_start);
-		let blocks_since_last_rotation =
-			blocks_since_session_start % config.group_rotation_frequency.max(1u8.into());
+		let next_rotation = rotation_info.next_rotation_at();

-		if blocks_since_last_rotation >= config.paras_availability_period {
-			None
-		} else {
-			Some(|core_index: CoreIndex, pending_since| {
-				let availability_cores = AvailabilityCores::<T>::get();
-				let AssignmentProviderConfig { availability_period, .. } =
-					T::AssignmentProvider::get_provider_config(core_index);
-				let now = <frame_system::Pallet<T>>::block_number();
-				match availability_cores.get(core_index.0 as usize) {
-					None => true, // out-of-bounds, doesn't really matter what is returned.
-					Some(CoreOccupied::Free) => true, // core free, still doesn't matter.
-					Some(CoreOccupied::Paras(_)) =>
-						now.saturating_sub(pending_since) >= availability_period,
-				}
-			})
+		let times_out = Self::availability_timeout_check_required();
+
+		move |pending_since| {
+			let time_out_at = if times_out {
+				// We are at the beginning of the rotation, here availability period is relevant.
+				// Note: blocks backed in this rotation will never time out here as backed_in +
+				// config.paras_availability_period will always be > now for these blocks, as
+				// otherwise above condition would not be true.
+				pending_since + config.paras_availability_period
+			} else {
+				next_rotation + config.paras_availability_period
+			};
+
+			AvailabilityTimeoutStatus { timed_out: time_out_at <= now, live_until: time_out_at }
 		}
 	}

+	/// Is evaluation of `availability_timeout_predicate` necessary at the current block?
+	///
+	/// This can be used to avoid calling `availability_timeout_predicate` for each core in case
+	/// this function returns false.
+	pub(crate) fn availability_timeout_check_required() -> bool {
+		let config = <configuration::Pallet<T>>::config();
+		let now = <frame_system::Pallet<T>>::block_number() + One::one();
+		let rotation_info = Self::group_rotation_info(now);
+
+		let current_window = rotation_info.last_rotation_at() + config.paras_availability_period;
+		now < current_window
+	}
+
 	/// Returns a helper for determining group rotation.
 	pub(crate) fn group_rotation_info(
 		now: BlockNumberFor<T>,
@@ -508,7 +597,7 @@ impl<T: Config> Pallet<T> {
 	pub(crate) fn update_claimqueue(
 		just_freed_cores: impl IntoIterator<Item = (CoreIndex, FreedReason)>,
 		now: BlockNumberFor<T>,
-	) -> Vec<CoreAssignment<BlockNumberFor<T>>> {
+	) {
 		Self::move_claimqueue_forward();
 		Self::free_cores_and_fill_claimqueue(just_freed_cores, now)
 	}
@@ -534,61 +623,58 @@ impl<T: Config> Pallet<T> {
 	fn free_cores_and_fill_claimqueue(
 		just_freed_cores: impl IntoIterator<Item = (CoreIndex, FreedReason)>,
 		now: BlockNumberFor<T>,
-	) -> Vec<CoreAssignment<BlockNumberFor<T>>> {
+	) {
 		let (mut concluded_paras, mut timedout_paras) = Self::free_cores(just_freed_cores);

 		// This can only happen on new sessions at which we move all assignments back to the
 		// provider. Hence, there's nothing we need to do here.
 		if ValidatorGroups::<T>::get().is_empty() {
-			vec![]
-		} else {
-			let n_lookahead = Self::claimqueue_lookahead();
-			let n_session_cores = T::AssignmentProvider::session_core_count();
-			let cq = ClaimQueue::<T>::get();
-			let ttl = <configuration::Pallet<T>>::config().on_demand_ttl;
+			return
+		}
+		let n_lookahead = Self::claimqueue_lookahead();
+		let n_session_cores = T::AssignmentProvider::session_core_count();
+		let cq = ClaimQueue::<T>::get();
+		let ttl = <configuration::Pallet<T>>::config().on_demand_ttl;

-			for core_idx in 0..n_session_cores {
-				let core_idx = CoreIndex::from(core_idx);
+		for core_idx in 0..n_session_cores {
+			let core_idx = CoreIndex::from(core_idx);

-				// add previously timedout paras back into the queue
-				if let Some(mut entry) = timedout_paras.remove(&core_idx) {
-					let AssignmentProviderConfig { max_availability_timeouts, .. } =
-						T::AssignmentProvider::get_provider_config(core_idx);
-					if entry.availability_timeouts < max_availability_timeouts {
-						// Increment the timeout counter.
-						entry.availability_timeouts += 1;
-						// Reset the ttl so that a timed out assignment.
-						entry.ttl = now + ttl;
-						Self::add_to_claimqueue(core_idx, entry);
-						// The claim has been added back into the claimqueue.
-						// Do not pop another assignment for the core.
-						continue
-					} else {
-						// Consider timed out assignments for on demand parachains as concluded for
-						// the assignment provider
-						let ret = concluded_paras.insert(core_idx, entry.para_id());
-						debug_assert!(ret.is_none());
-					}
-				}
-
-				// We  consider occupied cores to be part of the claimqueue
-				let n_lookahead_used = cq.get(&core_idx).map_or(0, |v| v.len() as u32) +
-					if Self::is_core_occupied(core_idx) { 1 } else { 0 };
-				for _ in n_lookahead_used..n_lookahead {
-					let concluded_para = concluded_paras.remove(&core_idx);
-					if let Some(assignment) =
-						T::AssignmentProvider::pop_assignment_for_core(core_idx, concluded_para)
-					{
-						Self::add_to_claimqueue(core_idx, ParasEntry::new(assignment, now + ttl));
-					}
+			// add previously timedout paras back into the queue
+			if let Some(mut entry) = timedout_paras.remove(&core_idx) {
+				let AssignmentProviderConfig { max_availability_timeouts, .. } =
+					T::AssignmentProvider::get_provider_config(core_idx);
+				if entry.availability_timeouts < max_availability_timeouts {
+					// Increment the timeout counter.
+					entry.availability_timeouts += 1;
+					// Reset the ttl so that a timed out assignment.
+					entry.ttl = now + ttl;
+					Self::add_to_claimqueue(core_idx, entry);
+					// The claim has been added back into the claimqueue.
+					// Do not pop another assignment for the core.
+					continue
+				} else {
+					// Consider timed out assignments for on demand parachains as concluded for
+					// the assignment provider
+					let ret = concluded_paras.insert(core_idx, entry.para_id());
+					debug_assert!(ret.is_none());
 				}
 			}

-			debug_assert!(timedout_paras.is_empty());
-			debug_assert!(concluded_paras.is_empty());
-
-			Self::scheduled_claimqueue()
+			// We  consider occupied cores to be part of the claimqueue
+			let n_lookahead_used = cq.get(&core_idx).map_or(0, |v| v.len() as u32) +
+				if Self::is_core_occupied(core_idx) { 1 } else { 0 };
+			for _ in n_lookahead_used..n_lookahead {
+				let concluded_para = concluded_paras.remove(&core_idx);
+				if let Some(assignment) =
+					T::AssignmentProvider::pop_assignment_for_core(core_idx, concluded_para)
+				{
+					Self::add_to_claimqueue(core_idx, ParasEntry::new(assignment, now + ttl));
+				}
+			}
 		}
+
+		debug_assert!(timedout_paras.is_empty());
+		debug_assert!(concluded_paras.is_empty());
 	}

 	fn is_core_occupied(core_idx: CoreIndex) -> bool {
@@ -623,29 +709,22 @@ impl<T: Config> Pallet<T> {
 				.ok_or("remove returned None")?
 				.ok_or("Element in Claimqueue was None.")?;

-			// Since the core is now occupied, the next entry in the claimqueue in order to achieve
-			// 12 second block times needs to be None
-			if core_claims.front() != Some(&None) {
-				core_claims.push_front(None);
-			}
 			Ok((pos as u32, pe))
 		})
 	}

-	// TODO: Temporary to imitate the old schedule() call. Will be adjusted when we make the
-	// scheduler AB ready
-	pub(crate) fn scheduled_claimqueue() -> Vec<CoreAssignment<BlockNumberFor<T>>> {
+	/// Paras scheduled next in the claim queue.
+	pub(crate) fn scheduled_paras() -> impl Iterator<Item = (CoreIndex, ParaId)> {
+		Self::scheduled_entries().map(|(core_idx, e)| (core_idx, e.assignment.para_id))
+	}
+
+	/// Internal access to entries at the top of the claim queue.
+	fn scheduled_entries() -> impl Iterator<Item = (CoreIndex, ParasEntry<BlockNumberFor<T>>)> {
 		let claimqueue = ClaimQueue::<T>::get();

 		claimqueue
 			.into_iter()
-			.flat_map(|(core_idx, v)| {
-				v.front()
-					.cloned()
-					.flatten()
-					.map(|pe| CoreAssignment { core: core_idx, paras_entry: pe })
-			})
-			.collect()
+			.filter_map(|(core_idx, v)| v.front().cloned().flatten().map(|e| (core_idx, e)))
 	}

 	#[cfg(any(feature = "runtime-benchmarks", test))]