Reduce network bandwidth, improve parablock times: optimize approval-distribution (#5164)

* gossip-support: be explicit about dimensions

* some guide updates

* update network-bridge to distinguish x and y dimensions

* get everything to compile

* beginnings

* some TODOs

* polkadot runtime: use relevant_authorities

* make gossip topologies per-session

* better formatting

* gossip support: use current session validators

* expand in comment

* adjust tests and fix index bug

* add past/present/future connection test and clean up code

* fmt

* network bridge: updated types

* update protocols to new gossip topology message

* guide updates

* add session to BlockApprovalMeta

* add session to block info

* refactor knowledge and remove most unify logic

* start replacing gossip_peers with new SessionTopologies

* add routing information to message state

* add some utilities to SessionTopology

* implement new gossip topology logic

* re-implement unify_with_peer

* distribute assignments according to topology

* finish grid topology implementation

* refactor network bridge slightly

* issue connection requests on all past/present/future

* fmt

* address grumbles

* tighten invariants in unify_with_peer

* implement random propagation

* refactor: extract required routing adjustment logic

* some block-age logic

* aggressively propagate messages when finality is slow

* overhaul aggression system to have 3 levels

* add aggression metrics

* remove aggression L3

* reduce random circulation

* remove PeerData

* get approval tests compiling

* use btree_map in known_by to make deterministic

* Revert "use btree_map in known_by to make deterministic"

This reverts commit 330d65343a7bb6fe4dd0f24bd8dbc15c0cbdbd9d.

* test XY grid propagation

* remove stray println

* test unshared dimension propagation

* add random gossip check

* test unify_with_peer better

* test sending after getting gossip topology

* test L1 aggression on originator

* test L1 aggression for non-originators

* test non-originator aggression L2

* fmt

* ~spellcheck

* fix statement-distribution tests

* fix flaky test

* fix metrics typo

* re-send periodically

* test resending

* typo

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>

* add more metrics about apd messages

* add back unify_with_peer logs

* make Resend an enum

* be more explicit when resending

* fmt

* fix error

* add a TODO for refactoring

* remove debug metrics

* add some guide stuff

* fmt

* update runtime API in test-runtime

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>
This commit is contained in:
asynchronous rob
2022-04-19 13:26:55 -05:00
committed by GitHub
parent edfa24bbc5
commit 79ecc53801
25 changed files with 2563 additions and 499 deletions
@@ -10,11 +10,13 @@ polkadot-node-network-protocol = { path = "../protocol" }
polkadot-node-subsystem = { path = "../../subsystem" }
polkadot-node-subsystem-util = { path = "../../subsystem-util" }
polkadot-primitives = { path = "../../../primitives" }
rand = "0.8"
futures = "0.3.21"
gum = { package = "tracing-gum", path = "../../gum" }
[dev-dependencies]
sp-authority-discovery = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master", features = ["std"] }
polkadot-node-subsystem-util = { path = "../../subsystem-util" }
@@ -23,5 +25,6 @@ polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" }
assert_matches = "1.4.0"
schnorrkel = { version = "0.9.1", default-features = false }
rand_core = "0.5.1" # should match schnorrkel
rand_chacha = "0.3.1"
env_logger = "0.9.0"
log = "0.4.16"
File diff suppressed because it is too large Load Diff
@@ -25,6 +25,8 @@ struct MetricsInner {
assignments_imported_total: prometheus::Counter<prometheus::U64>,
approvals_imported_total: prometheus::Counter<prometheus::U64>,
unified_with_peer_total: prometheus::Counter<prometheus::U64>,
aggression_l1_messages_total: prometheus::Counter<prometheus::U64>,
aggression_l2_messages_total: prometheus::Counter<prometheus::U64>,
time_unify_with_peer: prometheus::Histogram,
time_import_pending_now_known: prometheus::Histogram,
@@ -69,6 +71,18 @@ impl Metrics {
.as_ref()
.map(|metrics| metrics.time_awaiting_approval_voting.start_timer())
}
/// Note one message for which aggression L1 has been triggered.
///
/// Bumps the `aggression_l1_messages_total` counter. This is a no-op when the
/// wrapped metrics are `None`.
pub(crate) fn on_aggression_l1(&self) {
	if let Some(inner) = self.0.as_ref() {
		inner.aggression_l1_messages_total.inc();
	}
}
/// Note one message for which aggression L2 has been triggered.
///
/// Bumps the `aggression_l2_messages_total` counter. This is a no-op when the
/// wrapped metrics are `None`.
pub(crate) fn on_aggression_l2(&self) {
	if let Some(inner) = self.0.as_ref() {
		inner.aggression_l2_messages_total.inc();
	}
}
}
impl MetricsTrait for Metrics {
@@ -95,6 +109,20 @@ impl MetricsTrait for Metrics {
)?,
registry,
)?,
aggression_l1_messages_total: prometheus::register(
prometheus::Counter::new(
"polkadot_parachain_approval_distribution_aggression_l1_messages_total",
"Number of messages in approval distribution for which aggression L1 has been triggered",
)?,
registry,
)?,
aggression_l2_messages_total: prometheus::register(
prometheus::Counter::new(
"polkadot_parachain_approval_distribution_aggression_l2_messages_total",
"Number of messages in approval distribution for which aggression L2 has been triggered",
)?,
registry,
)?,
time_unify_with_peer: prometheus::register(
prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
"polkadot_parachain_time_unify_with_peer",
File diff suppressed because it is too large Load Diff