Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public static class Counter {
public static final String TXS = "tron:txs";
public static final String MINER = "tron:miner";
public static final String BLOCK_FORK = "tron:block_fork";
public static final String SR_SET_CHANGE = "tron:sr_set_change";
Comment thread
Sunny6889 marked this conversation as resolved.
public static final String P2P_ERROR = "tron:p2p_error";
public static final String P2P_DISCONNECT = "tron:p2p_disconnect";
public static final String INTERNAL_SERVICE_FAIL = "tron:internal_service_fail";
Expand Down Expand Up @@ -62,6 +63,7 @@ public static class Histogram {
public static final String MESSAGE_PROCESS_LATENCY = "tron:message_process_latency_seconds";
public static final String BLOCK_FETCH_LATENCY = "tron:block_fetch_latency_seconds";
public static final String BLOCK_RECEIVE_DELAY = "tron:block_receive_delay_seconds";
public static final String BLOCK_TRANSACTION_COUNT = "tron:block_transaction_count";

private Histogram() {
throw new IllegalStateException("Histogram");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public static class Counter {
public static final String TXS_FAIL_SIG = "sig";
public static final String TXS_FAIL_TAPOS = "tapos";
public static final String TXS_FAIL_DUP = "dup";
public static final String SR_ADD = "add";
public static final String SR_REMOVE = "remove";

private Counter() {
throw new IllegalStateException("Counter");
Expand Down Expand Up @@ -66,6 +68,7 @@ private Gauge() {

// Histogram
public static class Histogram {
public static final String MINER = "miner";
public static final String TRAFFIC_IN = "in";
public static final String TRAFFIC_OUT = "out";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class MetricsCounter {
init(MetricKeys.Counter.TXS, "tron txs info .", "type", "detail");
init(MetricKeys.Counter.MINER, "tron miner info .", "miner", "type");
init(MetricKeys.Counter.BLOCK_FORK, "tron block fork info .", "type");
init(MetricKeys.Counter.SR_SET_CHANGE, "tron sr set change .", "action", "witness");
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The witness label uses addresses. Although the Active SR pool is relatively limited (27 slots), as SRs rotate in and out over time, label values will accumulate monotonically — they only grow, never shrink. This is a known Prometheus anti-pattern.

Practical impact assessment:

  • TRON's SR candidate pool is not infinite, and rotation frequency is low, so it won't explode in the short term

  • However, as a long-running node metric, months or years of operation could still accumulate a significant number of time series

Possible approaches:

  • If the address dimension is truly needed, keep the current design, but document this characteristic

  • Or remove the witness label entirely, keeping only action (add/remove), and output address details via logs instead. Follow the principle that Prometheus should be used for alerting and awareness, while logs are for precise investigation and root-cause analysis.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for raising the high-cardinality concern — it's a valid Prometheus best practice to watch for.

However, I believe the witness label is acceptable here given TRON's specific constraints:

  1. Bounded candidate pool: The SR candidate pool is finite. Even over years of operation, the total number of distinct addresses that have ever held an SR slot has a practical ceiling — far from the
    unbounded growth typical of high-cardinality anti-patterns.
  2. Low rotation frequency: SR set changes only occur at maintenance intervals (every 6 hours), so new label values are introduced very infrequently.
  3. Diagnostic value: The primary purpose of this metric is to identify which SR was added or removed. Dropping the witness label would reduce the metric to just "something changed", requiring operators to
    cross-reference logs — losing the at-a-glance visibility in Grafana dashboards that makes this metric useful in the first place.

In summary, the worst-case time series count is roughly candidate_pool_size × 2 (add/remove), which stays well within Prometheus's comfortable operating range.

That said, I agree it's worth documenting this characteristic. Could you suggest what form you'd prefer — a code comment at the metric registration site, a note in the PR description, or something else?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefer to a code comment at the metric registration site.

init(MetricKeys.Counter.P2P_ERROR, "tron p2p error info .", "type");
init(MetricKeys.Counter.P2P_DISCONNECT, "tron p2p disconnect .", "type");
init(MetricKeys.Counter.INTERNAL_SERVICE_FAIL, "internal Service fail.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.JSONRPC_SERVICE_LATENCY, "JsonRpc Service latency.",
"method");
init(MetricKeys.Histogram.MINER_LATENCY, "miner latency.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.PING_PONG_LATENCY, "node ping pong latency.");
init(MetricKeys.Histogram.VERIFY_SIGN_LATENCY, "verify sign latency for trx , block.",
"type");
Expand All @@ -36,7 +36,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.PROCESS_TRANSACTION_LATENCY, "process transaction latency.",
"type", "contract");
init(MetricKeys.Histogram.MINER_DELAY, "miner delay time, actualTime - planTime.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.UDP_BYTES, "udp_bytes traffic.",
"type");
init(MetricKeys.Histogram.TCP_BYTES, "tcp_bytes traffic.",
Expand All @@ -48,6 +48,11 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.BLOCK_FETCH_LATENCY, "fetch block latency.");
init(MetricKeys.Histogram.BLOCK_RECEIVE_DELAY,
"receive block delay time, receiveTime - blockTime.");

init(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT,
"Distribution of transaction counts per block.",
new double[]{0, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
MetricLabels.Histogram.MINER);
}

private MetricsHistogram() {
Expand All @@ -62,6 +67,17 @@ private static void init(String name, String help, String... labels) {
.register());
}

private static void init(String name, String help, double[] buckets, String... labels) {
Histogram.Builder builder = Histogram.build()
.name(name)
.help(help)
.labelNames(labels);
if (buckets != null && buckets.length > 0) {
builder.buckets(buckets);
}
container.put(name, builder.register());
}

static Histogram.Timer startTimer(String key, String... labels) {
if (Metrics.enabled()) {
Histogram histogram = container.get(key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import com.codahale.metrics.Counter;
import com.google.protobuf.ByteString;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.Setter;
import org.bouncycastle.util.encoders.Hex;
Expand Down Expand Up @@ -42,6 +45,10 @@ public class BlockChainMetricManager {
private long failProcessBlockNum = 0;
@Setter
private String failProcessBlockReason = "";
// Only accessed from block processing thread (synchronized on blockLock)
private final Set<String> lastActiveWitnesses = new HashSet<>();
// To control SR set change metric update logic, -1 means not initialized
private long lastNextMaintenanceTime = -1;

public BlockChainInfo getBlockChainInfo() {
BlockChainInfo blockChainInfo = new BlockChainInfo();
Expand Down Expand Up @@ -164,11 +171,61 @@ public void applyBlock(BlockCapsule block) {
}

//TPS
if (block.getTransactions().size() > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, block.getTransactions().size());
Metrics.counterInc(MetricKeys.Counter.TXS, block.getTransactions().size(),
int txCount = block.getTransactions().size();
if (txCount > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, txCount);
Metrics.counterInc(MetricKeys.Counter.TXS, txCount,
MetricLabels.Counter.TXS_SUCCESS, MetricLabels.Counter.TXS_SUCCESS);
}
if (Metrics.enabled()) {
// Record transaction count distribution for all blocks (including empty blocks)
Metrics.histogramObserve(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT, txCount,
StringUtil.encode58Check(address));

// SR set change detection
long nextMaintenanceTime = dbManager.getDynamicPropertiesStore().getNextMaintenanceTime();
if (lastNextMaintenanceTime == -1) {
lastNextMaintenanceTime = nextMaintenanceTime;
lastActiveWitnesses.addAll(chainBaseManager.getWitnessScheduleStore().getActiveWitnesses()
.stream().map(w -> StringUtil.encode58Check(w.toByteArray()))
.collect(Collectors.toSet()));
} else if (nextMaintenanceTime != lastNextMaintenanceTime) {
Set<String> currentWitnesses =
chainBaseManager.getWitnessScheduleStore().getActiveWitnesses()
.stream()
.map(w -> StringUtil.encode58Check(w.toByteArray()))
.collect(Collectors.toSet());
if (nextMaintenanceTime < lastNextMaintenanceTime) {
// Fork rollback detected — reinitialize instead of diffing
lastActiveWitnesses.clear();
lastActiveWitnesses.addAll(currentWitnesses);
} else {
recordSrSetChange(currentWitnesses);
}
lastNextMaintenanceTime = nextMaintenanceTime;
}
}
}

private void recordSrSetChange(Set<String> currentWitnesses) {
Set<String> added = new HashSet<>(currentWitnesses);
added.removeAll(lastActiveWitnesses);

Set<String> removed = new HashSet<>(lastActiveWitnesses);
removed.removeAll(currentWitnesses);

for (String address : added) {
Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
MetricLabels.Counter.SR_ADD, address);
}
for (String address : removed) {
Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
MetricLabels.Counter.SR_REMOVE, address);
}
if (!added.isEmpty() || !removed.isEmpty()) {
lastActiveWitnesses.clear();
lastActiveWitnesses.addAll(currentWitnesses);
}
Comment thread
Sunny6889 marked this conversation as resolved.
}

private List<WitnessInfo> getSrList() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
Expand All @@ -25,6 +26,7 @@
import org.tron.common.utils.ByteArray;
import org.tron.common.utils.PublicMethod;
import org.tron.common.utils.Sha256Hash;
import org.tron.common.utils.StringUtil;
import org.tron.common.utils.Utils;
import org.tron.consensus.dpos.DposSlot;
import org.tron.core.ChainBaseManager;
Expand All @@ -38,6 +40,8 @@

@Slf4j(topic = "metric")
public class PrometheusApiServiceTest extends BaseTest {


static LocalDateTime localDateTime = LocalDateTime.now();
@Resource
private DposSlot dposSlot;
Expand Down Expand Up @@ -65,7 +69,7 @@ protected static void initParameter(CommonParameter parameter) {
parameter.setMetricsPrometheusEnable(true);
}

protected void check() throws Exception {
protected void check(byte[] address, Map<ByteString, String> witnessAndAccount) throws Exception {
Double memoryBytes = CollectorRegistry.defaultRegistry.getSampleValue(
"system_total_physical_memory_bytes");
Assert.assertNotNull(memoryBytes);
Expand All @@ -80,6 +84,55 @@ protected void check() throws Exception {
new String[] {"sync"}, new String[] {"false"});
Assert.assertNotNull(pushBlock);
Assert.assertEquals(pushBlock.intValue(), blocks + 1);

String minerBase58 = StringUtil.encode58Check(address);
// Query histogram bucket le="0.0" for empty blocks
Double emptyBlock = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:block_transaction_count_bucket",
new String[] {MetricLabels.Histogram.MINER, "le"}, new String[] {minerBase58, "0.0"});

Assert.assertNotNull("Empty block bucket should exist for miner: " + minerBase58, emptyBlock);
Assert.assertEquals("Should have 1 empty block", 1, emptyBlock.intValue());

// Check SR_REMOVE for initial address (removed when addTestWitnessAndAccount() is called)
Double srRemoveCount = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:sr_set_change_total",
new String[] {"action", "witness"},
new String[] {MetricLabels.Counter.SR_REMOVE, minerBase58}
);
Assert.assertNotNull(srRemoveCount);
Assert.assertEquals(1, srRemoveCount.intValue());

// Check SR_ADD and empty blocks for each new witness in witnessAndAccount
// (excluding initial address)
ByteString addressByteString = ByteString.copyFrom(address);
int totalNewWitnessEmptyBlocks = 0;
for (ByteString witnessAddress : witnessAndAccount.keySet()) {
if (witnessAddress.equals(addressByteString)) {
continue; // Skip initial address
}
String witnessBase58 = StringUtil.encode58Check(witnessAddress.toByteArray());

// Check SR_ADD
Double srAddCount = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:sr_set_change_total",
new String[] {"action", "witness"},
new String[] {MetricLabels.Counter.SR_ADD, witnessBase58}
);
Assert.assertNotNull("SR_ADD should be recorded for witness: " + witnessBase58,
srAddCount);
Assert.assertEquals("Each new witness should have 1 SR_ADD record", 1,
srAddCount.intValue());

// Collect empty blocks count from histogram bucket
int witnessEmptyBlock = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:block_transaction_count_bucket",
new String[] {MetricLabels.Histogram.MINER, "le"}, new String[] {witnessBase58, "0.0"})
.intValue();
totalNewWitnessEmptyBlocks += witnessEmptyBlock;
Comment thread
Sunny6889 marked this conversation as resolved.
}
Assert.assertEquals(blocks, totalNewWitnessEmptyBlocks);

Double errorLogs = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:error_info_total", new String[] {"net"}, new String[] {MetricLabels.UNDEFINED});
Assert.assertNull(errorLogs);
Expand Down Expand Up @@ -130,10 +183,23 @@ public void testMetric() throws Exception {

Map<ByteString, String> witnessAndAccount = addTestWitnessAndAccount();
witnessAndAccount.put(ByteString.copyFrom(address), key);

// Explicitly update WitnessScheduleStore to remove initial address,
// triggering SR_REMOVE metric
List<ByteString> newActiveWitnesses = new ArrayList<>(witnessAndAccount.keySet());
newActiveWitnesses.remove(ByteString.copyFrom(address));
chainBaseManager.getWitnessScheduleStore().saveActiveWitnesses(newActiveWitnesses);

// Update nextMaintenanceTime to trigger SR set change detection
long nextMaintenanceTime =
chainBaseManager.getDynamicPropertiesStore().getNextMaintenanceTime();
chainBaseManager.getDynamicPropertiesStore().updateNextMaintenanceTime(
nextMaintenanceTime + 3600_000L);

for (int i = 0; i < blocks; i++) {
generateBlock(witnessAndAccount);
}
check();
check(address, witnessAndAccount);
}

private Map<ByteString, String> addTestWitnessAndAccount() {
Expand Down
Loading