Fix nothing scheduled on session boundary (#1403)

* Fix scheduled state at session boundaries.

* Cleanup + better docs.

* More cleanup and fixes.

* Remove 12s hack.

* Add dep.

* Make clippy happy

---------

Co-authored-by: eskimor <eskimor@no-such-url.com>
This commit is contained in:
eskimor
2023-09-06 18:19:21 +02:00
committed by GitHub
parent eeb368ed9c
commit 1e2a2f0c69
19 changed files with 489 additions and 643 deletions
+179 -100
View File
@@ -39,11 +39,11 @@
use crate::{configuration, initializer::SessionChangeNotification, paras};
use frame_support::pallet_prelude::*;
use frame_system::pallet_prelude::BlockNumberFor;
pub use polkadot_core_primitives::v2::BlockNumber;
use primitives::{
v5::ParasEntry, CoreIndex, CoreOccupied, GroupIndex, GroupRotationInfo, Id as ParaId,
ScheduledCore, ValidatorIndex,
CoreIndex, GroupIndex, GroupRotationInfo, Id as ParaId, ScheduledCore, ValidatorIndex,
};
use sp_runtime::traits::{One, Saturating};
use sp_runtime::traits::One;
use sp_std::{
collections::{btree_map::BTreeMap, vec_deque::VecDeque},
prelude::*,
@@ -51,7 +51,7 @@ use sp_std::{
pub mod common;
use common::{AssignmentProvider, AssignmentProviderConfig, CoreAssignment, FreedReason};
use common::{Assignment, AssignmentProvider, AssignmentProviderConfig};
pub use pallet::*;
@@ -101,6 +101,36 @@ pub mod pallet {
pub(crate) type AvailabilityCores<T: Config> =
StorageValue<_, Vec<CoreOccupied<BlockNumberFor<T>>>, ValueQuery>;
/// Representation of a core in `AvailabilityCores`.
///
/// This is not to be confused with `CoreState` which is an enriched variant of this and exposed
/// to the node side. It also provides information about scheduled/upcoming assignments for
/// example and is computed on the fly in the `availability_cores` runtime call.
#[derive(Clone, Encode, Decode, TypeInfo, RuntimeDebug)]
#[cfg_attr(feature = "std", derive(PartialEq))]
pub enum CoreOccupied<N> {
/// No candidate is waiting availability on this core right now (the core is not occupied).
Free,
/// A para is currently waiting for availability/inclusion on this core.
Paras(ParasEntry<N>),
}
impl<N> CoreOccupied<N> {
/// Is core free?
pub fn is_free(&self) -> bool {
matches!(self, Self::Free)
}
}
/// Reasons a core might be freed.
#[derive(Clone, Copy)]
pub enum FreedReason {
/// The core's work concluded and the parablock assigned to it is considered available.
Concluded,
/// The core's work timed out.
TimedOut,
}
/// The block number where the session start occurred. Used to track how many group rotations
/// have occurred.
///
@@ -124,6 +154,68 @@ pub mod pallet {
BTreeMap<CoreIndex, VecDeque<Option<ParasEntry<BlockNumberFor<T>>>>>,
ValueQuery,
>;
/// Assignments as tracked in the claim queue.
#[derive(Clone, Encode, Decode, TypeInfo, PartialEq, RuntimeDebug)]
pub struct ParasEntry<N = BlockNumber> {
/// The underlying `Assignment`
pub assignment: Assignment,
/// The number of times the entry has timed out in availability already.
pub availability_timeouts: u32,
/// The block height until this entry needs to be backed.
///
/// If missed the entry will be removed from the claim queue without ever having occupied
/// the core.
pub ttl: N,
}
impl<N> ParasEntry<N> {
/// Return `Id` from the underlying `Assignment`.
pub fn para_id(&self) -> ParaId {
self.assignment.para_id
}
/// Create a new `ParasEntry`.
pub fn new(assignment: Assignment, now: N) -> Self {
ParasEntry { assignment, availability_timeouts: 0, ttl: now }
}
}
/// How a core is mapped to a backing group and a `ParaId`
#[derive(Clone, Encode, Decode, PartialEq, TypeInfo)]
#[cfg_attr(feature = "std", derive(Debug))]
pub struct CoreAssignment<BlockNumber> {
/// The core that is assigned.
pub core: CoreIndex,
/// The para id and accompanying information needed to collate and back a parablock.
pub paras_entry: ParasEntry<BlockNumber>,
}
impl<BlockNumber> CoreAssignment<BlockNumber> {
/// Returns the [`ParaId`] of the assignment.
pub fn para_id(&self) -> ParaId {
self.paras_entry.para_id()
}
/// Returns the inner [`ParasEntry`] of the assignment.
pub fn to_paras_entry(self) -> ParasEntry<BlockNumber> {
self.paras_entry
}
}
/// Availability timeout status of a core.
pub(crate) struct AvailabilityTimeoutStatus<BlockNumber> {
/// Is the core already timed out?
///
/// If this is true the core will be freed at this block.
pub timed_out: bool,
/// When does this core timeout.
///
/// The block number the core times out. If `timed_out` is true, this will correspond to
/// now (current block number).
pub live_until: BlockNumber,
}
}
type PositionInClaimqueue = u32;
@@ -368,50 +460,47 @@ impl<T: Config> Pallet<T> {
Some(GroupIndex(group_idx as u32))
}
/// Returns an optional predicate that should be used for timing out occupied cores.
/// Returns a predicate that should be used for timing out occupied cores.
///
/// If `None`, no timing-out should be done. The predicate accepts the index of the core, and
/// the block number since which it has been occupied, and the respective parachain timeouts,
/// i.e. only within `config.paras_availability_period` of the last rotation would this return
/// `Some`, unless there are no rotations.
///
/// The timeout used to depend, but does not depend any more on group rotations. First of all
/// it only matters if a para got another chance (a retry). If there is a retry and it happens
/// still within the same group rotation a censoring backing group would need to censor again
/// and lose out again on backing rewards. This is bad for the censoring backing group, it does
/// not matter for the parachain as long as it is retried often enough (so it eventually gets a
/// try on another backing group) - the effect is similar to having a prolonged timeout. It
/// should also be noted that for both malicious and offline backing groups it is actually more
/// realistic that the candidate will not be backed to begin with, instead of getting backed
/// and then not made available.
/// This only ever times out cores that have been occupied across a group rotation boundary.
pub(crate) fn availability_timeout_predicate(
) -> Option<impl Fn(CoreIndex, BlockNumberFor<T>) -> bool> {
let now = <frame_system::Pallet<T>>::block_number();
) -> impl Fn(BlockNumberFor<T>) -> AvailabilityTimeoutStatus<BlockNumberFor<T>> {
let config = <configuration::Pallet<T>>::config();
let session_start = <SessionStartBlock<T>>::get();
let now = <frame_system::Pallet<T>>::block_number();
let rotation_info = Self::group_rotation_info(now);
let blocks_since_session_start = now.saturating_sub(session_start);
let blocks_since_last_rotation =
blocks_since_session_start % config.group_rotation_frequency.max(1u8.into());
let next_rotation = rotation_info.next_rotation_at();
if blocks_since_last_rotation >= config.paras_availability_period {
None
} else {
Some(|core_index: CoreIndex, pending_since| {
let availability_cores = AvailabilityCores::<T>::get();
let AssignmentProviderConfig { availability_period, .. } =
T::AssignmentProvider::get_provider_config(core_index);
let now = <frame_system::Pallet<T>>::block_number();
match availability_cores.get(core_index.0 as usize) {
None => true, // out-of-bounds, doesn't really matter what is returned.
Some(CoreOccupied::Free) => true, // core free, still doesn't matter.
Some(CoreOccupied::Paras(_)) =>
now.saturating_sub(pending_since) >= availability_period,
}
})
let times_out = Self::availability_timeout_check_required();
move |pending_since| {
let time_out_at = if times_out {
// We are at the beginning of the rotation, here availability period is relevant.
// Note: blocks backed in this rotation will never time out here as backed_in +
// config.paras_availability_period will always be > now for these blocks, as
// otherwise above condition would not be true.
pending_since + config.paras_availability_period
} else {
next_rotation + config.paras_availability_period
};
AvailabilityTimeoutStatus { timed_out: time_out_at <= now, live_until: time_out_at }
}
}
/// Is evaluation of `availability_timeout_predicate` necessary at the current block?
///
/// This can be used to avoid calling `availability_timeout_predicate` for each core in case
/// this function returns false.
pub(crate) fn availability_timeout_check_required() -> bool {
let config = <configuration::Pallet<T>>::config();
let now = <frame_system::Pallet<T>>::block_number() + One::one();
let rotation_info = Self::group_rotation_info(now);
let current_window = rotation_info.last_rotation_at() + config.paras_availability_period;
now < current_window
}
/// Returns a helper for determining group rotation.
pub(crate) fn group_rotation_info(
now: BlockNumberFor<T>,
@@ -508,7 +597,7 @@ impl<T: Config> Pallet<T> {
pub(crate) fn update_claimqueue(
just_freed_cores: impl IntoIterator<Item = (CoreIndex, FreedReason)>,
now: BlockNumberFor<T>,
) -> Vec<CoreAssignment<BlockNumberFor<T>>> {
) {
Self::move_claimqueue_forward();
Self::free_cores_and_fill_claimqueue(just_freed_cores, now)
}
@@ -534,61 +623,58 @@ impl<T: Config> Pallet<T> {
fn free_cores_and_fill_claimqueue(
just_freed_cores: impl IntoIterator<Item = (CoreIndex, FreedReason)>,
now: BlockNumberFor<T>,
) -> Vec<CoreAssignment<BlockNumberFor<T>>> {
) {
let (mut concluded_paras, mut timedout_paras) = Self::free_cores(just_freed_cores);
// This can only happen on new sessions at which we move all assignments back to the
// provider. Hence, there's nothing we need to do here.
if ValidatorGroups::<T>::get().is_empty() {
vec![]
} else {
let n_lookahead = Self::claimqueue_lookahead();
let n_session_cores = T::AssignmentProvider::session_core_count();
let cq = ClaimQueue::<T>::get();
let ttl = <configuration::Pallet<T>>::config().on_demand_ttl;
return
}
let n_lookahead = Self::claimqueue_lookahead();
let n_session_cores = T::AssignmentProvider::session_core_count();
let cq = ClaimQueue::<T>::get();
let ttl = <configuration::Pallet<T>>::config().on_demand_ttl;
for core_idx in 0..n_session_cores {
let core_idx = CoreIndex::from(core_idx);
for core_idx in 0..n_session_cores {
let core_idx = CoreIndex::from(core_idx);
// add previously timedout paras back into the queue
if let Some(mut entry) = timedout_paras.remove(&core_idx) {
let AssignmentProviderConfig { max_availability_timeouts, .. } =
T::AssignmentProvider::get_provider_config(core_idx);
if entry.availability_timeouts < max_availability_timeouts {
// Increment the timeout counter.
entry.availability_timeouts += 1;
// Reset the ttl so that a timed out assignment.
entry.ttl = now + ttl;
Self::add_to_claimqueue(core_idx, entry);
// The claim has been added back into the claimqueue.
// Do not pop another assignment for the core.
continue
} else {
// Consider timed out assignments for on demand parachains as concluded for
// the assignment provider
let ret = concluded_paras.insert(core_idx, entry.para_id());
debug_assert!(ret.is_none());
}
}
// We consider occupied cores to be part of the claimqueue
let n_lookahead_used = cq.get(&core_idx).map_or(0, |v| v.len() as u32) +
if Self::is_core_occupied(core_idx) { 1 } else { 0 };
for _ in n_lookahead_used..n_lookahead {
let concluded_para = concluded_paras.remove(&core_idx);
if let Some(assignment) =
T::AssignmentProvider::pop_assignment_for_core(core_idx, concluded_para)
{
Self::add_to_claimqueue(core_idx, ParasEntry::new(assignment, now + ttl));
}
// add previously timedout paras back into the queue
if let Some(mut entry) = timedout_paras.remove(&core_idx) {
let AssignmentProviderConfig { max_availability_timeouts, .. } =
T::AssignmentProvider::get_provider_config(core_idx);
if entry.availability_timeouts < max_availability_timeouts {
// Increment the timeout counter.
entry.availability_timeouts += 1;
// Reset the ttl so that a timed out assignment.
entry.ttl = now + ttl;
Self::add_to_claimqueue(core_idx, entry);
// The claim has been added back into the claimqueue.
// Do not pop another assignment for the core.
continue
} else {
// Consider timed out assignments for on demand parachains as concluded for
// the assignment provider
let ret = concluded_paras.insert(core_idx, entry.para_id());
debug_assert!(ret.is_none());
}
}
debug_assert!(timedout_paras.is_empty());
debug_assert!(concluded_paras.is_empty());
Self::scheduled_claimqueue()
// We consider occupied cores to be part of the claimqueue
let n_lookahead_used = cq.get(&core_idx).map_or(0, |v| v.len() as u32) +
if Self::is_core_occupied(core_idx) { 1 } else { 0 };
for _ in n_lookahead_used..n_lookahead {
let concluded_para = concluded_paras.remove(&core_idx);
if let Some(assignment) =
T::AssignmentProvider::pop_assignment_for_core(core_idx, concluded_para)
{
Self::add_to_claimqueue(core_idx, ParasEntry::new(assignment, now + ttl));
}
}
}
debug_assert!(timedout_paras.is_empty());
debug_assert!(concluded_paras.is_empty());
}
fn is_core_occupied(core_idx: CoreIndex) -> bool {
@@ -623,29 +709,22 @@ impl<T: Config> Pallet<T> {
.ok_or("remove returned None")?
.ok_or("Element in Claimqueue was None.")?;
// Since the core is now occupied, the next entry in the claimqueue in order to achieve
// 12 second block times needs to be None
if core_claims.front() != Some(&None) {
core_claims.push_front(None);
}
Ok((pos as u32, pe))
})
}
// TODO: Temporary to imitate the old schedule() call. Will be adjusted when we make the
// scheduler AB ready
pub(crate) fn scheduled_claimqueue() -> Vec<CoreAssignment<BlockNumberFor<T>>> {
/// Paras scheduled next in the claim queue.
pub(crate) fn scheduled_paras() -> impl Iterator<Item = (CoreIndex, ParaId)> {
Self::scheduled_entries().map(|(core_idx, e)| (core_idx, e.assignment.para_id))
}
/// Internal access to entries at the top of the claim queue.
fn scheduled_entries() -> impl Iterator<Item = (CoreIndex, ParasEntry<BlockNumberFor<T>>)> {
let claimqueue = ClaimQueue::<T>::get();
claimqueue
.into_iter()
.flat_map(|(core_idx, v)| {
v.front()
.cloned()
.flatten()
.map(|pe| CoreAssignment { core: core_idx, paras_entry: pe })
})
.collect()
.filter_map(|(core_idx, v)| v.front().cloned().flatten().map(|e| (core_idx, e)))
}
#[cfg(any(feature = "runtime-benchmarks", test))]