From 1a59c03f282a86dc22c6daf8a271b1d834da2c49 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Fri, 19 Jun 2026 16:12:07 -0500
Subject: [PATCH] [GVN] Don't coerce vector store via i128+

GVN forwards a wide stored value to a narrow load from the same address
by bitcasting the stored value to an integer, then truncating to the
loaded value's size. This avoids re-loading a value that was just
stored.

For DXIL this is unsafe if the wider size is greater than 64 bits, since
DXIL doesn't allow integers larger than 64 bits. This change disables
coercing in this case. This shouldn't cause any performance regressions
in practice because no existing cases can generate valid DXIL of this
form, but it does generate less optimal final output.

We could consider more robust load-store optimizations and coercion to
integer vectors as an alternative, but at this time the important thing
is to make DXC not generate invalid DXIL, which this simplified change
does.

Assisted by Claude Opus 4.7
---
 lib/Transforms/Scalar/GVN.cpp                 |  9 ++++
 .../GVN/no-large-int-vector-coercion.ll       | 49 +++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tools/clang/test/DXC/Passes/GVN/no-large-int-vector-coercion.ll
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 3436359d20..2a170795bb 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -862,6 +862,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
     return false;
   if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
     return false;
+
+  // Reject coercions that require bitcasting a non-integer value (e.g. a native
+  // vector) to an integer wider than 64 bits. DXIL does not support integer
+  // types wider than 64 bits, so such coercions would produce invalid DXIL.
+  uint64_t StoredValBits = DL.getTypeSizeInBits(StoredValTy);
+  uint64_t LoadBits = DL.getTypeSizeInBits(LoadTy);
+  if (StoredValBits > 64 && !StoredValTy->isIntegerTy() &&
+      (StoredValBits != LoadBits || LoadTy->isIntegerTy()))
+    return false;
   // HLSL Change End
 
   // The store has to be at least as big as the load.
diff --git a/tools/clang/test/DXC/Passes/GVN/no-large-int-vector-coercion.ll b/tools/clang/test/DXC/Passes/GVN/no-large-int-vector-coercion.ll
new file mode 100644
index 0000000000..767c2251be
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/GVN/no-large-int-vector-coercion.ll
@@ -0,0 +1,49 @@
+; RUN: %dxopt %s -hlsl-passes-resume -gvn -S | FileCheck %s
+
+; Regression test: when a load is fed by a wider vector store at the same
+; address, GVN previously coerced the value by bit-casting the vector to a
+; same-sized integer (e.g. <4 x i32> -> i128) and truncating. DXIL does not
+; support integer widths greater than 64 bits, so the resulting module
+; failed validation with "Int type 'i128' has an invalid width."
+;
+; This test ensures GVN does not introduce a bitcast to an oversized
+; integer when forwarding the wider vector value to a narrower scalar load.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+; CHECK-LABEL: @test_v4i32_to_i32
+; CHECK-NOT: i128
+; CHECK-NOT: i256
+; CHECK: ret i32
+define i32 @test_v4i32_to_i32(<4 x i32>* %p) {
+entry:
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %p ; 128-bit constant vector store to %p
+  %sp = bitcast <4 x i32>* %p to i32* ; same address, viewed as a pointer to one i32
+  %v = load i32, i32* %sp ; 32-bit load from the address that was just stored to
+  ret i32 %v
+}
+
+; CHECK-LABEL: @test_v8i32_to_i32
+; CHECK-NOT: i256
+; CHECK: ret i32
+define i32 @test_v8i32_to_i32(<8 x i32>* %p, <8 x i32> %vec) {
+entry:
+  store <8 x i32> %vec, <8 x i32>* %p ; 256-bit store of a non-constant vector argument
+  %sp = bitcast <8 x i32>* %p to i32* ; same address, viewed as a pointer to one i32
+  %v = load i32, i32* %sp ; 32-bit load from the address that was just stored to
+  ret i32 %v
+}
+
+; CHECK-LABEL: @test_v2i32_to_i32
+; <2 x i32> is exactly 64 bits, which is a legal DXIL integer width, so
+; coercing the stored value is fine here. We only verify no i128 appears.
+; CHECK-NOT: i128
+; CHECK: ret i32
+define i32 @test_v2i32_to_i32(<2 x i32>* %p, <2 x i32> %vec) {
+entry:
+  store <2 x i32> %vec, <2 x i32>* %p ; 64-bit store of a non-constant vector argument
+  %sp = bitcast <2 x i32>* %p to i32* ; same address, viewed as a pointer to one i32
+  %v = load i32, i32* %sp ; 32-bit load from the address that was just stored to
+  ret i32 %v
+}