Skip to content

Commit 0853880

Browse files
authored
GPU/TPC: Encode saturated qTot and tailLength in ClusterNative (#15472)
* Encode saturated qTot and tailLength in ClusterNative
* Adjust cluster compression
* Compute tail length as earliest tb to last tb from any tail
* Add fallbacks when accessing properties of saturated clusters
* Format
* Fix OpenCL compilation
* Fix OpenCL compilation (2)
1 parent f0ef58d commit 0853880

6 files changed

Lines changed: 158 additions & 17 deletions

File tree

DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#ifndef ALICEO2_DATAFORMATSTPC_CLUSTERNATIVE_H
1616
#define ALICEO2_DATAFORMATSTPC_CLUSTERNATIVE_H
1717
#ifndef GPUCA_GPUCODE_DEVICE
18+
#include <climits>
1819
#include <cstdint>
1920
#include <cstddef> // for size_t
2021
#include <utility>
@@ -62,6 +63,8 @@ struct ClusterNative {
6263
static constexpr int scalePadPacked = 64; //< ~60 is needed for 0.1mm precision, but power of two avoids rounding
6364
static constexpr int scaleSigmaTimePacked = 32; // 1/32nd of pad/timebin precision for cluster size
6465
static constexpr int scaleSigmaPadPacked = 32;
66+
static constexpr int scaleSaturatedQTot = 4;
67+
static constexpr int maxSaturatedQTot = USHRT_MAX * scaleSaturatedQTot;
6568

6669
uint32_t timeFlagsPacked; //< Contains the time in the lower 24 bits in a packed format, contains the flags in the
6770
// upper 8 bits
@@ -83,7 +86,15 @@ struct ClusterNative {
8386
}
8487

8588
GPUd() uint16_t getQmax() const { return qMax; }
86-
GPUd() uint16_t getQtot() const { return qTot; }
89+
/// Total charge of the cluster as a 16-bit value.
/// For saturated clusters the stored qTot is a scaled encoding, so it is decoded first
/// and then clipped to USHRT_MAX to keep the return type unchanged.
GPUd() uint16_t getQtot() const
{
  if (isSaturated()) [[unlikely]] {
    const uint32_t fullCharge = getSaturatedQtot();
    if (fullCharge > USHRT_MAX) {
      // Decoded charge does not fit into 16 bits: report the largest representable value.
      return USHRT_MAX;
    }
    return static_cast<uint16_t>(fullCharge);
  }
  // Regular cluster: the field holds the charge directly.
  return qTot;
}
8798
GPUd() uint8_t getFlags() const { return timeFlagsPacked >> 24; }
8899
GPUd() uint32_t getTimePacked() const { return timeFlagsPacked & 0xFFFFFF; }
89100
GPUd() void setTimePackedFlags(uint32_t timePacked, uint8_t flags)
@@ -119,7 +130,13 @@ struct ClusterNative {
119130
/// Y = (12.4 - 0.5 * (66 - 1)) * 4.16mm = -83.616mm
120131
GPUd() float getPad() const { return unpackPad(padPacked); }
121132
GPUd() void setPad(float pad) { padPacked = packPad(pad); }
122-
GPUd() float getSigmaTime() const { return float(sigmaTimePacked) * (1.f / scaleSigmaTimePacked); }
133+
/// Cluster width in time direction, unpacked from sigmaTimePacked.
/// Saturated clusters reuse sigmaTimePacked for the encoded tail length
/// (see setSaturatedTailLength), so no width is available and 0 is returned.
GPUd() float getSigmaTime() const
{
  if (isSaturated()) [[unlikely]] {
    return 0.f;
  }
  constexpr float unpackFactor = 1.f / scaleSigmaTimePacked;
  return static_cast<float>(sigmaTimePacked) * unpackFactor;
}
123140
GPUd() void setSigmaTime(float sigmaTime)
124141
{
125142
uint32_t tmp = sigmaTime * scaleSigmaTimePacked + 0.5;
@@ -138,6 +155,31 @@ struct ClusterNative {
138155
sigmaPadPacked = tmp;
139156
}
140157

158+
/// A cluster counts as saturated once its peak charge qMax reaches 1023 or more.
GPUd() bool isSaturated() const
{
  return !(qMax < 1023);
}
159+
160+
/// Store the total charge of a saturated cluster.
/// The value is clamped to maxSaturatedQTot and then divided by scaleSaturatedQTot
/// (rounding to nearest) so that it fits into the qTot field.
GPUd() void setSaturatedQtot(uint32_t qtot)
{
  const uint32_t clamped = (qtot > maxSaturatedQTot) ? maxSaturatedQTot : qtot;
  // Adding half the scale before dividing rounds to the nearest step instead of truncating.
  const uint32_t halfStep = scaleSaturatedQTot / 2;
  this->qTot = (clamped + halfStep) / scaleSaturatedQTot;
}
167+
168+
/// Decode the total charge written by setSaturatedQtot: the stored qTot times scaleSaturatedQTot.
GPUd() uint32_t getSaturatedQtot() const
{
  const uint32_t stored = qTot;
  return stored * scaleSaturatedQTot;
}
172+
173+
/// Store the tail length of a saturated cluster.
/// The length is quantized to an 8-bit code and kept in sigmaTimePacked.
GPUd() void setSaturatedTailLength(uint32_t tail)
{
  const uint8_t code = encodeTailLength(tail);
  sigmaTimePacked = code;
}
177+
178+
/// Decode the quantized tail length that setSaturatedTailLength stored in sigmaTimePacked.
GPUd() uint32_t getSaturatedTailLength() const
{
  const uint8_t code = sigmaTimePacked;
  return decodeTailLength(code);
}
182+
141183
GPUd() bool operator<(const ClusterNative& rhs) const
142184
{
143185
if (this->getTimePacked() != rhs.getTimePacked()) {
@@ -167,6 +209,93 @@ struct ClusterNative {
167209
this->qTot == rhs.qTot &&
168210
this->getFlags() == rhs.getFlags();
169211
}
212+
213+
private:
214+
/// Map an 8-bit tail-length code back to a tail length in time bins (tbs).
///
/// The quantization is piecewise linear, fine for short tails and coarse for long ones.
/// Lengths up to about 1500 tbs are expected; outliers up to about 8000 tbs are representable.
///
/// | Code range | Decoded values | Step  | Codes |
/// | ---------: | -------------: | ----: | ----: |
/// | `0..63`    | `0..63`        | `1`   | `64`  |
/// | `64..95`   | `64..126`      | `2`   | `32`  |
/// | `96..127`  | `128..252`     | `4`   | `32`  |
/// | `128..159` | `256..504`     | `8`   | `32`  |
/// | `160..223` | `512..1520`    | `16`  | `64`  |
/// | `224..239` | `1552..2032`   | `32`  | `16`  |
/// | `240..255` | `2048..8048`   | `400` | `16`  |
static constexpr GPUd() uint32_t decodeTailLength(uint8_t code)
{
  const uint32_t c = code;

  // Segments are tested from the top of the code range downwards.
  if (c >= 240u) {
    return 2048u + (c - 240u) * 400u;
  }
  if (c >= 224u) {
    return 1552u + (c - 224u) * 32u;
  }
  if (c >= 160u) {
    return 512u + (c - 160u) * 16u;
  }
  if (c >= 64u) {
    // Three segments of 32 codes each; the step doubles from one segment to the next.
    const uint32_t offset = c - 64u;
    const uint32_t shift = 1u + (offset >> 5); // 1, 2, 3
    const uint32_t fraction = offset & 31u;    // 0..31
    return (fraction + 32u) << shift;
  }

  // Codes below 64 represent their own value.
  return c;
}
255+
256+
/// Quantize a tail length into the 8-bit code whose decoded value is closest to it.
/// Values at or beyond the largest decodable length map to code 255.
/// When two neighbouring codes are equally close, the lower code is chosen.
static constexpr GPUd() uint8_t encodeTailLength(uint32_t value)
{
  // Everything at or above the top of the representable range collapses onto the last code.
  if (value >= decodeTailLength(255)) [[unlikely]] {
    return 255;
  }

  // Lower-bound search over the codes: find the first one that decodes to >= value.
  uint32_t first = 0;
  uint32_t last = 255;
  while (first < last) {
    const uint32_t probe = first + (last - first) / 2;
    if (decodeTailLength(static_cast<uint8_t>(probe)) >= value) {
      last = probe;
    } else {
      first = probe + 1;
    }
  }

  // Code 0 has no lower neighbour to compare against.
  if (first == 0) [[unlikely]] {
    return 0;
  }

  const uint8_t upperCode = static_cast<uint8_t>(first);
  const uint8_t lowerCode = static_cast<uint8_t>(first - 1);

  // Distance from value to each of the two candidate codes.
  const uint32_t distUp = decodeTailLength(upperCode) - value;
  const uint32_t distDown = value - decodeTailLength(lowerCode);

  // Nearest neighbour wins; ties go to the lower code.
  return (distDown <= distUp) ? lowerCode : upperCode;
}
170299
};
171300

172301
// This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to

GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,11 @@ void GPUTPCClusterStatistics::RunStatistics(const o2::tpc::ClusterNativeAccess*
128128
tmpClusters[k] = clustersNative->clusters[i][j][k];
129129
if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
130130
GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param);
131-
GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param);
132131
GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param);
133-
GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param);
132+
if (!tmpClusters[k].isSaturated()) {
133+
GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param);
134+
GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param);
135+
}
134136
}
135137
}
136138
std::sort(tmpClusters.begin(), tmpClusters.end());

GPU/GPUTracking/DataCompression/GPUTPCCompression.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class GPUTPCCompression : public GPUProcessor
4747
#endif
4848

4949
static constexpr uint32_t P_MAX_QMAX = 1 << 10;
50-
static constexpr uint32_t P_MAX_QTOT = 5 * 5 * P_MAX_QMAX;
50+
static constexpr uint32_t P_MAX_QTOT = 1 << 16;
5151
static constexpr uint32_t P_MAX_TIME = 1 << 24;
5252
static constexpr uint32_t P_MAX_PAD = 1 << 16;
5353
static constexpr uint32_t P_MAX_SIGMA = 1 << 8;

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,11 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
121121
uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
122122
if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
123123
compressor.truncateSignificantBitsChargeMax(qmax, param);
124-
compressor.truncateSignificantBitsCharge(qtot, param);
125124
compressor.truncateSignificantBitsWidth(sigmapad, param);
126-
compressor.truncateSignificantBitsWidth(sigmatime, param);
125+
if (!orgCl.isSaturated()) {
126+
compressor.truncateSignificantBitsCharge(qtot, param);
127+
compressor.truncateSignificantBitsWidth(sigmatime, param);
128+
}
127129
}
128130
c.qTotA[cidx] = qtot;
129131
c.qMaxA[cidx] = qmax;
@@ -298,9 +300,11 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
298300
uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
299301
if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
300302
compressor.truncateSignificantBitsChargeMax(qmax, param);
301-
compressor.truncateSignificantBitsCharge(qtot, param);
302303
compressor.truncateSignificantBitsWidth(sigmapad, param);
303-
compressor.truncateSignificantBitsWidth(sigmatime, param);
304+
if (!orgCl.isSaturated()) {
305+
compressor.truncateSignificantBitsCharge(qtot, param);
306+
compressor.truncateSignificantBitsWidth(sigmatime, param);
307+
}
304308
}
305309
c.qTotU[outidx] = qtot;
306310
c.qMaxU[outidx] = qmax;

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -529,11 +529,12 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
529529
const float firstWeight = tail->qTot;
530530
const float firstPad = tail->pad;
531531
const float firstTime = HIPTailTimeMean(*tail);
532-
const float firstTimeVariance = HIPTailTimeVariance(*tail);
533532
float padSum = firstWeight * firstPad;
534533
float padSqSum = firstWeight * firstPad * firstPad;
535534
float timeSum = firstWeight * firstTime;
536-
float timeSqSum = firstWeight * (firstTime * firstTime + firstTimeVariance);
535+
536+
uint32_t tailStart = tail->tailStart;
537+
uint32_t tailEnd = tail->tailEnd;
537538

538539
while (tail->iNext != 0) {
539540

@@ -542,28 +543,27 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
542543
const float tailWeight = tail->qTot;
543544
const float tailPad = tail->pad;
544545
const float tailTime = HIPTailTimeMean(*tail);
545-
const float tailTimeVariance = HIPTailTimeVariance(*tail);
546546
qMax = CAMath::Max(qMax, tail->qMax);
547547
qTot += tail->qTot;
548548
padSum += tailWeight * tailPad;
549549
padSqSum += tailWeight * tailPad * tailPad;
550550
timeSum += tailWeight * tailTime;
551-
timeSqSum += tailWeight * (tailTime * tailTime + tailTimeVariance);
551+
tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
552+
tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
552553
}
553554

554555
const float weightSum = CAMath::Max(qTot, 1.f);
555556
float padMean = padSum / weightSum;
556557
float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
557558
float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
558-
float timeSigma = CAMath::Sqrt(CAMath::Max(0.f, timeSqSum / weightSum - timeMean * timeMean));
559559

560560
tpc::ClusterNative cn;
561561
cn.qMax = qMax;
562-
cn.qTot = (uint16_t)CAMath::Min(qTot, 65535.f);
562+
cn.setSaturatedQtot(qTot);
563+
cn.setSaturatedTailLength(tailEnd - tailStart);
563564
float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
564565
cn.setTimeFlags(clusterTime, 0);
565566
cn.setPad(padMean);
566-
cn.setSigmaTime(timeSigma);
567567
cn.setSigmaPad(padSigma);
568568

569569
if (cn.qMax >= 1023) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,13 @@ void GPUTPCClusterFinder::DumpClusters(std::ostream& out)
166166

167167
out << "Row: " << i << ": " << N << "\n";
168168
for (const auto& cl : sortedCluster) {
169-
out << std::hex << cl.timeFlagsPacked << std::dec << " " << cl.padPacked << " " << int32_t{cl.sigmaTimePacked} << " " << int32_t{cl.sigmaPadPacked} << " " << cl.qMax << " " << cl.qTot << "\n";
169+
uint32_t qTot = cl.qTot;
170+
uint32_t sigmaTime = cl.sigmaTimePacked;
171+
if (cl.isSaturated()) {
172+
qTot = cl.getSaturatedQtot();
173+
sigmaTime = cl.getSaturatedTailLength();
174+
}
175+
out << std::hex << cl.timeFlagsPacked << std::dec << " " << cl.padPacked << " " << sigmaTime << " " << int32_t{cl.sigmaPadPacked} << " " << cl.qMax << " " << qTot << "\n";
170176
}
171177
}
172178
}

0 commit comments

Comments
 (0)