diff --git a/.gitmodules b/.gitmodules index 350885ac54..4afb3bad23 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ [submodule "external/json-schema-validator"] path = external/json-schema-validator url = https://github.com/pboettch/json-schema-validator.git +[submodule "external/CRoaring"] + path = external/CRoaring + url = https://github.com/fabianbs96/CRoaring.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bb9701c17..7579e6292d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,9 @@ set(RELEASE_CONFIGURATIONS RELWITHDEBINFO RELEASE CACHE INTERNAL "" FORCE) string(APPEND CMAKE_CXX_FLAGS " -MP -fstack-protector-strong -ffunction-sections -fdata-sections -pipe") string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer") +string(APPEND CMAKE_C_FLAGS_DEBUG " -fno-omit-frame-pointer") string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-omit-frame-pointer") +string(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " -fno-omit-frame-pointer") string(APPEND CMAKE_CXX_FLAGS_RELEASE "") option(CMAKE_VISIBILITY_INLINES_HIDDEN "Hide inlined functions from the DSO table (default ON)" ON) @@ -123,6 +125,7 @@ if (NOT "${PHASAR_TARGET_ARCH_INTERNAL}" STREQUAL "") if (MARCH_SUPPORTED) message(STATUS "Target architecture '${PHASAR_TARGET_ARCH_INTERNAL}' enabled") string(APPEND CMAKE_CXX_FLAGS_RELEASE " -march=${PHASAR_TARGET_ARCH_INTERNAL}") + string(APPEND CMAKE_C_FLAGS_RELEASE " -march=${PHASAR_TARGET_ARCH_INTERNAL}") else() message(WARNING "Target architecture '${PHASAR_TARGET_ARCH_INTERNAL}' not supported. 
Fallback to generic build") endif() @@ -339,6 +342,15 @@ set(PHASAR_LLVM_VERSION 16 CACHE STRING "The LLVM major-version that PhASAR shou include(add_llvm) add_llvm() +# Roaring + +find_package(roaring QUIET) +if(NOT TARGET roaring::roaring) + set(ENABLE_ROARING_TESTS OFF) + add_subdirectory(external/CRoaring) + set(PHASAR_PROVIDE_CROARING ON) +endif() + # SVF option(PHASAR_USE_SVF "Use SVF for more options in alias analysis (default is OFF)" OFF) if(PHASAR_USE_SVF) diff --git a/Config.cmake.in b/Config.cmake.in index 085a277031..ffdc52fbaa 100644 --- a/Config.cmake.in +++ b/Config.cmake.in @@ -15,6 +15,14 @@ set(PHASAR_USE_LLVM_FAT_LIB @USE_LLVM_FAT_LIB@) set(PHASAR_BUILD_DYNLIB @PHASAR_BUILD_DYNLIB@) set(PHASAR_USE_Z3 @PHASAR_USE_Z3@) set(PHASAR_BUILD_MODULES @PHASAR_BUILD_MODULES@) +set(PHASAR_PROVIDE_CROARING @PHASAR_PROVIDE_CROARING@) + +if (PHASAR_PROVIDE_CROARING) + # TODO: Is that path portable? + include("${CMAKE_CURRENT_LIST_DIR}/../roaring/roaring-targets.cmake") +else() + find_dependency(roaring) +endif() if (PHASAR_USE_Z3) find_dependency(Z3 REQUIRED) diff --git a/external/CRoaring b/external/CRoaring new file mode 160000 index 0000000000..5505f1bf1a --- /dev/null +++ b/external/CRoaring @@ -0,0 +1 @@ +Subproject commit 5505f1bf1a62d9e7adad798b418ce873ddff7b1d diff --git a/include/phasar/PhasarLLVM/ControlFlow/Resolver/Resolver.h b/include/phasar/PhasarLLVM/ControlFlow/Resolver/Resolver.h index b2cba8ae3a..fbbc70d21f 100644 --- a/include/phasar/PhasarLLVM/ControlFlow/Resolver/Resolver.h +++ b/include/phasar/PhasarLLVM/ControlFlow/Resolver/Resolver.h @@ -18,11 +18,11 @@ #define PHASAR_PHASARLLVM_CONTROLFLOW_RESOLVER_RESOLVER_H_ #include "phasar/PhasarLLVM/Pointer/LLVMAliasInfo.h" +#include "phasar/PhasarLLVM/Utils/VirtualCallUtils.h" #include "phasar/Utils/MaybeUniquePtr.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/DerivedTypes.h" #include #include @@ -41,15 +41,6 @@ class LLVMVFTableProvider; class 
DIBasedTypeHierarchy; enum class CallGraphAnalysisType; -/// Assuming that `CallSite` is a virtual call through a vtable, retrieves the -/// index in the vtable of the virtual function called. -[[nodiscard]] std::optional -getVFTIndex(const llvm::CallBase *CallSite); - -/// Similar to getVFTIndex(), but also returns a pointer to the vtable -[[nodiscard]] std::optional> -getVFTIndexAndVT(const llvm::CallBase *CallSite); - /// Assuming that `CallSite` is a call to a non-static member function, /// retrieves the type of the receiver. Returns nullptr, if the receiver-type /// could not be extracted @@ -64,12 +55,6 @@ getReceiverType(const llvm::CallBase *CallSite); [[nodiscard]] std::string getReceiverTypeName(const llvm::CallBase *CallSite); -/// Checks whether the signature of `DestFun` matches the required withature of -/// `CallSite`, such that `DestFun` qualifies as callee-candidate, if `CallSite` -/// is an indirect/virtual call. -[[nodiscard]] bool isConsistentCall(const llvm::CallBase *CallSite, - const llvm::Function *DestFun); - [[nodiscard]] bool isVirtualCall(const llvm::Instruction *Inst, const LLVMVFTableProvider &VTP); diff --git a/include/phasar/PhasarLLVM/Pointer.h b/include/phasar/PhasarLLVM/Pointer.h index be473838bc..336d28a97c 100644 --- a/include/phasar/PhasarLLVM/Pointer.h +++ b/include/phasar/PhasarLLVM/Pointer.h @@ -12,10 +12,16 @@ #include "phasar/Config/phasar-config.h" // for PHASAR_USE_SVF #include "phasar/PhasarLLVM/Pointer/AliasAnalysisView.h" +#include "phasar/PhasarLLVM/Pointer/AndersenOTFAA.h" #include "phasar/PhasarLLVM/Pointer/FilteredLLVMAliasSet.h" #include "phasar/PhasarLLVM/Pointer/LLVMAliasInfo.h" #include "phasar/PhasarLLVM/Pointer/LLVMAliasSet.h" +#include "phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h" +#include "phasar/PhasarLLVM/Pointer/LLVMPointsToInfo.h" #include "phasar/PhasarLLVM/Pointer/LLVMPointsToUtils.h" +#include "phasar/PhasarLLVM/Pointer/LLVMUnionFindAA.h" +#include 
"phasar/PhasarLLVM/Pointer/LLVMUnionFindAliasSet.h" +#include "phasar/PhasarLLVM/Pointer/MemSSAUtils.h" #ifdef PHASAR_USE_SVF #include "phasar/PhasarLLVM/Pointer/SVF/SVFPointsToSet.h" diff --git a/include/phasar/PhasarLLVM/Pointer/AndersenOTFAA.h b/include/phasar/PhasarLLVM/Pointer/AndersenOTFAA.h new file mode 100644 index 0000000000..8b718379de --- /dev/null +++ b/include/phasar/PhasarLLVM/Pointer/AndersenOTFAA.h @@ -0,0 +1,113 @@ +#pragma once + +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. + * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "phasar/PhasarLLVM/ControlFlow/LLVMBasedCallGraph.h" +#include "phasar/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.h" +#include "phasar/PhasarLLVM/Pointer/LLVMUnionFindAA.h" +#include "phasar/Pointer/RawAliasSet.h" +#include "phasar/Pointer/UnionFindAA.h" +#include "phasar/Utils/MaybeUniquePtr.h" +#include "phasar/Utils/NonNullPtr.h" +#include "phasar/Utils/Soundness.h" +#include "phasar/Utils/TypedVector.h" +#include "phasar/Utils/ValueCompressor.h" + +#include "llvm/ADT/ArrayRef.h" + +namespace llvm { +class Function; +} // namespace llvm + +namespace psr { + +class LLVMProjectIRDB; + +/// Alias-analysis result for the Andersen-style OTF points-to analysis. +/// +/// Two values may-alias iff their points-to sets share at least one abstract +/// object. Satisfies \c UnionFindAAResult so it can be wrapped by +/// \c LLVMUnionFindAliasIterator. 
+struct AndersenOTFResult {
+  /// Alias sets addressed by \c ValueId: the accessors below bounds-check with
+  /// \c inbounds(Var) and read \c AliasSets[Var].
+  TypedVector> AliasSets;
+  /// Call graph carried together with the alias sets.
+  // NOTE(review): presumably the call graph refined by the same fixpoint --
+  // confirm against AndersenOTFSolver::solve().
+  LLVMBasedCallGraph CG;
+
+  /// Always \c true for this result type.
+  [[nodiscard]] static constexpr bool isCached() noexcept { return true; }
+  /// Number of entries in \c AliasSets.
+  [[nodiscard]] constexpr size_t size() const noexcept {
+    return AliasSets.size();
+  }
+
+  /// Returns the alias set stored for \p Var, or an empty set when \p Var is
+  /// out of bounds of \c AliasSets.
+  [[nodiscard]] RawAliasSet
+  getRawAliasSet(ValueId Var) const noexcept {
+    if (!AliasSets.inbounds(Var)) {
+      return {};
+    }
+    return AliasSets[Var];
+  }
+
+  /// May-alias query. A value always aliases itself; otherwise only the set
+  /// stored for \p Var1 is consulted (an out-of-bounds \p Var1 aliases
+  /// nothing but itself).
+  // NOTE(review): the lookup is one-directional, so a symmetric answer relies
+  // on the stored sets being symmetric -- confirm with the solver.
+  [[nodiscard]] bool mayAlias(ValueId Var1, ValueId Var2) const noexcept {
+    if (Var1 == Var2) {
+      return true;
+    }
+    if (!AliasSets.inbounds(Var1)) {
+      return false;
+    }
+    return AliasSets[Var1].contains(Var2);
+  }
+};
+
+static_assert(UnionFindAAResult);
+
+/// Andersen-style inclusion-based points-to analysis that co-refines the call
+/// graph and points-to sets in a single fixpoint.
+///
+/// Unlike the staged pipeline (resolver → PA), this solver owns its own
+/// function-worklist loop: direct calls add callees immediately; indirect
+/// calls are resolved as \c pts(fp) grows.
+///
+/// Phase 1: context- and field-insensitive.
+class AndersenOTFSolver {
+public:
+  /// \param IRDB    the program to analyze
+  /// \param Entries entry list handed to the solver
+  /// \param VC      value compressor used by the solver
+  /// \param S       soundness level; defaults to \c Soundness::Soundy
+  // NOTE(review): the definition is not visible here; since the constructor is
+  // noexcept it presumably only stores its arguments -- confirm in the .cpp.
+  explicit AndersenOTFSolver(const LLVMProjectIRDB &IRDB,
+                             llvm::ArrayRef Entries,
+                             ValueCompressor &VC,
+                             Soundness S = Soundness::Soundy) noexcept;
+
+  /// Run the full OTF fixpoint and return the alias-analysis result.
+  [[nodiscard]] AndersenOTFResult solve();
+
+private:
+  /// Solver state; only declared here.
+  struct SolverData;
+
+  // NOTE(review): presumably the constructor arguments of the same names.
+  // \c Entries is a non-owning \c llvm::ArrayRef, so the referenced storage
+  // has to stay alive for as long as the solver reads it.
+  NonNullPtr IRDB;
+  llvm::ArrayRef Entries;
+  NonNullPtr> VC;
+  Soundness S;
+};
+
+// ---- Factory functions ------------------------------------------------
+
+/// Runs the Andersen OTF fixpoint and returns the raw alias-analysis result
+/// (no LLVM-value wrapping). If \p VC is null, a fresh one is allocated.
+[[nodiscard]] AndersenOTFResult +computeAndersenOTFRaw(const LLVMProjectIRDB &IRDB, + llvm::ArrayRef EntryPoints, + MaybeUniquePtr> VC = nullptr, + Soundness S = Soundness::Soundy); + +/// Runs the Andersen OTF fixpoint and returns an \c LLVMUnionFindAliasIterator +/// that implements \c IsLLVMAliasIterator. +[[nodiscard]] LLVMUnionFindAliasIterator +computeAndersenOTF(const LLVMProjectIRDB &IRDB, + llvm::ArrayRef EntryPoints, + MaybeUniquePtr> VC = nullptr, + Soundness S = Soundness::Soundy); + +} // namespace psr diff --git a/include/phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h b/include/phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h new file mode 100644 index 0000000000..59ec22d22c --- /dev/null +++ b/include/phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h @@ -0,0 +1,91 @@ +#pragma once + +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. + * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "phasar/PhasarLLVM/Utils/LLVMShorthands.h" +#include "phasar/Utils/ValueCompressor.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Casting.h" + +#include +#include + +namespace psr { + +/// Memoised walker for global-variable pointer initializers. +/// +/// Traverses a \c llvm::Constant initializer and collects the \c ValueId of +/// every pointer-typed sub-constant it contains (direct pointer, GEP base, +/// or pointer elements of an aggregate). Results are cached so shared +/// sub-expressions are not revisited. +/// +/// Create one instance per analysis run; it is tied to a single +/// \c ValueCompressor via the \p GetVar callback. 
+struct GlobalInitCache { + std::unordered_map> + Cache; + + /// Returns the \c ValueId slice for all pointer-typed constants reachable + /// from \p Const. \p GetVar maps an \c llvm::Value* to a \c ValueId + /// (typically \c getOrInsertVar). + template GetVarFn> + [[nodiscard]] llvm::ArrayRef getOrCreate(const llvm::Constant *Const, + GetVarFn &&GetVar) { + if (definitelyContainsNoPointer(Const)) { + return {}; + } + + auto [It, Inserted] = Cache.try_emplace(Const); + if (!Inserted) { + return It->second; + } + auto &Vec = It->second; + + if (llvm::isa(Const)) { + return {}; + } + + if (const auto *CGep = llvm::dyn_cast(Const)) { + // TODO: Properly handle constant GEPs + return getOrCreate(llvm::cast(CGep->getPointerOperand()), + GetVar); + } + + if (Const->getType()->isPointerTy()) { + Vec.push_back(std::invoke(GetVar, Const)); + return Vec; + } + + // TODO: Get rid of the recursion + + if (const auto *Agg = llvm::dyn_cast(Const)) { + if (Agg->getType()->isArrayTy() && + definitelyContainsNoPointer(Agg->getType()->getArrayElementType())) { + return {}; + } + for (size_t I = 0, N = Agg->getNumOperands(); I < N; ++I) { + const auto *Elem = llvm::cast( + Agg->getAggregateElement(I)->stripPointerCastsAndAliases()); + auto Sub = getOrCreate(Elem, GetVar); + Vec.append(Sub.begin(), Sub.end()); + } + } + + // TODO: more + + return Vec; + } +}; + +} // namespace psr diff --git a/include/phasar/PhasarLLVM/Pointer/MemSSAUtils.h b/include/phasar/PhasarLLVM/Pointer/MemSSAUtils.h new file mode 100644 index 0000000000..f9fd5aa5c9 --- /dev/null +++ b/include/phasar/PhasarLLVM/Pointer/MemSSAUtils.h @@ -0,0 +1,55 @@ +#pragma once + +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. 
+ * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" + +namespace psr { + +// Bundle of per-function analyses for the built-in MemorySSA provider. +// Members are declared in initialization order: each field depends only on +// the ones before it. +struct MemSSABundle { + llvm::AssumptionCache AC; + llvm::DominatorTree DT; + llvm::TypeBasedAAResult TBAA; + llvm::ScopedNoAliasAAResult SNA; + llvm::BasicAAResult BAA; + llvm::AAResults AA; + llvm::MemorySSA MSSA; + + explicit MemSSABundle(llvm::Function &F, const llvm::TargetLibraryInfo *TLI); +}; + +/// Walks the MemorySSA def chain rooted at MA, collecting all StoreInst +/// reaching definitions into ReachingDefs. +/// Returns true if a LiveOnEntry def is reachable (value may come from outside +/// the function). In that case, ReachingDefs may be incompletely populated. +[[nodiscard]] bool collectReachingDefs( + llvm::MemoryAccess *MA, const llvm::MemorySSA &MSSA, + llvm::SmallPtrSetImpl &ReachingDefs, + llvm::SmallPtrSetImpl &Visited); + +/// Collects all store instructions that may define the value loaded from the +/// given load. Forwards to the above collectReachingDefs overload. 
+[[nodiscard]] bool collectReachingDefs( + const llvm::LoadInst *Load, llvm::MemorySSA &MSSA, + llvm::SmallPtrSetImpl &ReachingDefs); + +} // namespace psr diff --git a/include/phasar/PhasarLLVM/Utils/LLVMShorthands.h b/include/phasar/PhasarLLVM/Utils/LLVMShorthands.h index 0377bd5e83..19630bd793 100644 --- a/include/phasar/PhasarLLVM/Utils/LLVMShorthands.h +++ b/include/phasar/PhasarLLVM/Utils/LLVMShorthands.h @@ -19,12 +19,18 @@ #include "phasar/Utils/Utilities.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" +#include +#include #include #include @@ -301,6 +307,40 @@ definitelyContainsNoPointer(const llvm::Type *Ty) noexcept { definitelyContainsNoPointer(Val->getType()); } +/// Strips pointer-cast and alias wrappers from \p V, then invokes \p Handler +/// for each concrete underlying value: +/// - If \p V is not a ConstantExpr after stripping, Handler is called once +/// with the stripped value. +/// - If \p V is a ConstantExpr, the expression tree is walked and Handler +/// is called for each GlobalObject leaf. 
+template HandlerT>
+void forEachPointerOperand(const llvm::Value *V, HandlerT Handler) {
+  V = V->stripPointerCastsAndAliases();
+  const auto *CExpr = llvm::dyn_cast(V);
+  // Common case: the stripped value is not a constant expression, so it is
+  // reported as-is, exactly once.
+  if (!CExpr) [[likely]] {
+    std::invoke(Handler, V);
+    return;
+  }
+
+  // Iterative worklist walk over the operands of the constant expression.
+  // `Seen` makes sure an operand reachable via several paths is handled once.
+  llvm::SmallPtrSet Seen = {V};
+  llvm::SmallVector WL = {CExpr};
+  do {
+    const auto *Curr = WL.pop_back_val();
+    for (const auto *Op : Curr->operand_values()) {
+      // Skip operands that cannot contain a pointer and ones already visited.
+      if (definitelyContainsNoPointer(Op) || !Seen.insert(Op).second) {
+        continue;
+      }
+      // Leaf: report it to the handler and do not descend any further.
+      if (const auto *GObj = llvm::dyn_cast(Op)) {
+        std::invoke(Handler, static_cast(GObj));
+        continue;
+      }
+      // Operands that have operands of their own are expanded in a later
+      // iteration; any other operand is dropped without invoking the handler.
+      if (const auto *OpUser = llvm::dyn_cast(Op)) {
+        WL.push_back(OpUser);
+      }
+    }
+  } while (!WL.empty());
+}
+
 /// Approximates, whether the given LLVM value may be address-taken, i.e.,
 /// whether its pointer value is used for other purposes than just
 /// store/load/gep.
@@ -372,6 +412,17 @@ getVaListTagOrNull(const llvm::Function &Fun);
 [[nodiscard]] bool isVaListAlloca(const llvm::AllocaInst &Alloc);
 [[nodiscard]] const llvm::DIType *stripPointerTypes(const llvm::DIType *DITy);
+
+/// Walk a constant initializer along a GEP index path and return the
+/// \c Function* at the leaf, or nullptr.
+///
+/// \p Indices mirrors GEP index semantics:
+/// - \c Indices[0] is the outer "pointer-array" index:
+///   \c ConstantArray -> selects the element; \c ConstantStruct ->
+///   must be 0 (pointer-arithmetic no-op, struct is not an array).
+/// - \c Indices[1+] navigate recursively through ConstantAggregate.
+[[nodiscard]] const llvm::Function * +walkConstInitPath(const llvm::Constant *Init, llvm::ArrayRef Indices); } // namespace psr #endif diff --git a/include/phasar/PhasarLLVM/Utils/VirtualCallUtils.h b/include/phasar/PhasarLLVM/Utils/VirtualCallUtils.h new file mode 100644 index 0000000000..9cd766ff74 --- /dev/null +++ b/include/phasar/PhasarLLVM/Utils/VirtualCallUtils.h @@ -0,0 +1,52 @@ +#pragma once + +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. + * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "llvm/ADT/SmallVector.h" + +#include +#include +#include + +namespace llvm { +class CallBase; +class Value; +class Type; +class Function; +} // namespace llvm + +namespace psr { + +/// Assuming that `CallSite` is a virtual call through a vtable, retrieves the +/// index in the vtable of the virtual function called. +[[nodiscard]] std::optional +getVFTIndex(const llvm::CallBase *CallSite); + +/// Similar to getVFTIndex(), but also returns a pointer to the vtable +[[nodiscard]] std::optional> +getVFTIndexAndVT(const llvm::CallBase *CallSite); + +/// Detects the pattern \c call(load(GEP(base, const_indices...))) with a +/// typed (>=3-operand) GEP, i.e. an indirect call through a struct function +/// pointer field. Distinct from the 2-operand raw-pointer C++ vptr case +/// handled by \c getVFTIndexAndVT. +/// +/// Returns \c {base_ptr, all_GEP_indices, gep_source_elem_ty} on match, +/// or \c std::nullopt otherwise. 
+[[nodiscard]] std::optional, llvm::Type *>> +getStructVCallInfo(const llvm::CallBase *CallSite); + +/// Checks whether the signature of `DestFun` matches the required signature of +/// `CallSite`, such that `DestFun` qualifies as callee-candidate, if `CallSite` +/// is an indirect/virtual call. +[[nodiscard]] bool isConsistentCall(const llvm::CallBase *CallSite, + const llvm::Function *DestFun); +} // namespace psr diff --git a/include/phasar/Pointer/AliasAnalysisType.def b/include/phasar/Pointer/AliasAnalysisType.def index 258819b378..c7359ff64c 100644 --- a/include/phasar/Pointer/AliasAnalysisType.def +++ b/include/phasar/Pointer/AliasAnalysisType.def @@ -16,6 +16,7 @@ ALIAS_ANALYSIS_TYPE(CFLSteens, "cflsteens", "Steensgaard-style alias analysis (e ALIAS_ANALYSIS_TYPE(CFLAnders, "cflanders", "Andersen-style alias analysis (subset-based) (default)") ALIAS_ANALYSIS_TYPE(PointsTo, "points-to", "Alias-information based on (external) points-to information") ALIAS_ANALYSIS_TYPE(UnionFind, "union-find", "Steensgaard-style alias analysis based on union-find structures") +ALIAS_ANALYSIS_TYPE(AndersenOTF, "andersen-otf", "Andersen-style inclusion-based on-the-fly points-to analysis") #ifdef PHASAR_USE_SVF ALIAS_ANALYSIS_TYPE(SVFDDA, "svf-dda", "Alias-information based on SVF's ContextDDA analysis. 
Requires SVF.") diff --git a/include/phasar/Pointer/RawAliasSet.h b/include/phasar/Pointer/RawAliasSet.h index 2aa70f0f31..f92994230d 100644 --- a/include/phasar/Pointer/RawAliasSet.h +++ b/include/phasar/Pointer/RawAliasSet.h @@ -10,10 +10,15 @@ *****************************************************************************/ #include "phasar/Utils/TypeTraits.h" +#include "phasar/Utils/Utilities.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SparseBitVector.h" +#include "roaring/roaring.hh" + #include +#include namespace psr { @@ -33,16 +38,30 @@ concept IsRawAliasSet = requires(ASet &MutSet, const ASet &ConstSet, { ConstSet.contains(ValId) } -> std::convertible_to; // ConstSet.begin(); // ConstSet.end(); + + /// Iteration must be in ascending order ConstSet.foreach (DummyFn{}); MutSet |= ConstSet; MutSet &= ConstSet; MutSet -= ConstSet; + { ConstSet - ConstSet } -> std::convertible_to; { ConstSet == ConstSet } noexcept -> std::convertible_to; { ConstSet != ConstSet } noexcept -> std::convertible_to; { MutSet.tryMergeWith(ConstSet) } -> std::convertible_to; { MutSet.clear() } noexcept; { ConstSet.empty() } noexcept -> std::convertible_to; { ConstSet.size() } noexcept -> std::convertible_to; + { + // Merges the ConstSet into MutSet, as with tryMergeWith, but invokes a + // callback for each element that was newly inserted. The Diff will be + // materialized and merged into that out-param + MutSet.mergeWithDiff(ConstSet, DummyFn{}, MutSet) + } -> std::convertible_to; + { + // Merges the ConstSet into MutSet, as with tryMergeWith, but invokes a + // callback for each element that was newly inserted. + MutSet.mergeWithDiff(ConstSet, DummyFn{}) + } -> std::convertible_to; }; /// Sparse bit-set used to represent alias sets in union-find analyses. @@ -52,11 +71,11 @@ concept IsRawAliasSet = requires(ASet &MutSet, const ASet &ConstSet, /// Satisfies \c IsRawAliasSet. /// /// \tparam IdT Integer-like id type (e.g., \c ValueId). 
-template class RawAliasSet { +template class LLVMRawAliasSet { public: using value_type = IdT; - RawAliasSet() = default; + LLVMRawAliasSet() = default; void insert(IdT Id) { Bits.set(uint32_t(Id)); } @@ -66,19 +85,32 @@ template class RawAliasSet { [[nodiscard]] bool contains(IdT Id) const { return Bits.test(uint32_t(Id)); } - LLVM_ATTRIBUTE_ALWAYS_INLINE void foreach ( - std::invocable auto Handler) const { + template HandlerFn> + LLVM_ATTRIBUTE_ALWAYS_INLINE void foreach (HandlerFn Handler) const { for (auto Bit : Bits) { - std::invoke(Handler, IdT(Bit)); + if constexpr (std::convertible_to, + bool>) { + if (!std::invoke(Handler, IdT(Bit))) { + break; + } + } else { + std::invoke(Handler, IdT(Bit)); + } } } - void operator|=(const RawAliasSet &Other) { Bits |= Other.Bits; } - void operator&=(const RawAliasSet &Other) { Bits &= Other.Bits; } - void operator-=(const RawAliasSet &Other) { + void operator|=(const LLVMRawAliasSet &Other) { Bits |= Other.Bits; } + void operator&=(const LLVMRawAliasSet &Other) { Bits &= Other.Bits; } + void operator-=(const LLVMRawAliasSet &Other) { Bits.intersectWithComplement(Other.Bits); } + [[nodiscard]] LLVMRawAliasSet operator-(const LLVMRawAliasSet &Other) const { + LLVMRawAliasSet Ret; + Ret.Bits = Bits - Other.Bits; + return Ret; + } + [[nodiscard]] bool empty() const noexcept { return Bits.empty(); } [[nodiscard]] size_t size() const noexcept { return Bits.count(); } @@ -87,18 +119,180 @@ template class RawAliasSet { [[nodiscard]] auto begin() const noexcept { return Bits.begin(); } [[nodiscard]] auto end() const noexcept { return Bits.end(); } - [[nodiscard]] bool tryMergeWith(const RawAliasSet &Other) { + [[nodiscard]] bool tryMergeWith(const LLVMRawAliasSet &Other) { return Bits |= Other.Bits; } void erase(IdT Id) { Bits.reset(uint32_t(Id)); } - [[nodiscard]] bool operator==(const RawAliasSet &Other) const noexcept { + [[nodiscard]] bool operator==(const LLVMRawAliasSet &Other) const noexcept { return Bits == Other.Bits; 
} + bool mergeWithDiff(const LLVMRawAliasSet &Other, + std::invocable auto WithNewElem, + LLVMRawAliasSet &IntoDiff) { + return mergeWithDiffImpl(Other, copyOrRef(WithNewElem), &IntoDiff); + } + + bool mergeWithDiff(const LLVMRawAliasSet &Other, + std::invocable auto WithNewElem) { + return mergeWithDiffImpl(Other, copyOrRef(WithNewElem), nullptr); + } + private: + bool mergeWithDiffImpl(const LLVMRawAliasSet &Other, + std::invocable auto WithNewElem, + LLVMRawAliasSet *IntoDiff) { + auto Diff = Other.Bits - Bits; + if (Diff.empty()) { + return false; + } + + Bits |= Diff; + if (IntoDiff) { + IntoDiff->Bits |= Diff; + } + for (auto Elem : Diff) { + std::invoke(WithNewElem, IdT(Elem)); + } + return true; + } + llvm::SparseBitVector<> Bits; - // TODO: roaring::Roaring Bits; }; + +template class RoaringAliasSet { +public: + using value_type = IdT; + + RoaringAliasSet() = default; + + void insert(IdT Id) { Bits.add(uint32_t(Id)); } + + [[nodiscard]] bool tryInsert(IdT Id) { return Bits.addChecked(uint32_t(Id)); } + + [[nodiscard]] bool contains(IdT Id) const { + return Bits.contains(uint32_t(Id)); + } + + template HandlerFn> + LLVM_ATTRIBUTE_ALWAYS_INLINE void foreach (HandlerFn Handler) const { + return Bits.iterate( + [](uint32_t Id, void *HandlerPtr) { + auto &Handler = *(HandlerFn *)HandlerPtr; + if constexpr (std::convertible_to< + std::invoke_result_t, bool>) { + if (!std::invoke(Handler, IdT(Id))) { + return false; + } + } else { + std::invoke(Handler, IdT(Id)); + } + return true; + }, + &Handler); + } + + void operator|=(const RoaringAliasSet &Other) { Bits |= Other.Bits; } + void operator&=(const RoaringAliasSet &Other) { Bits &= Other.Bits; } + void operator-=(const RoaringAliasSet &Other) { Bits -= Other.Bits; } + [[nodiscard]] RoaringAliasSet operator-(const RoaringAliasSet &Other) const { + return Bits - Other.Bits; + } + + [[nodiscard]] bool empty() const noexcept { return Bits.isEmpty(); } + [[nodiscard]] size_t size() const noexcept { return 
Bits.cardinality(); }
+
+  void clear() noexcept { Bits.clear(); }
+
+  [[nodiscard]] auto begin() const noexcept { return Bits.begin(); }
+  [[nodiscard]] auto end() const noexcept { return Bits.end(); }
+
+  /// Unions \p Other into this set; returns whether the set grew (detected by
+  /// comparing the cardinality before and after).
+  [[nodiscard]] bool tryMergeWith(const RoaringAliasSet &Other) {
+    auto OldSz = size();
+    Bits |= Other.Bits;
+    return size() != OldSz;
+  }
+
+  void erase(IdT Id) { Bits.remove(uint32_t(Id)); }
+
+  // Bulk-inserts from a sorted, deduplicated array.
+  // Roaring constructs containers in O(N) for sorted input.
+  void insertSorted(llvm::ArrayRef Sorted) {
+    Bits.addMany(Sorted.size(), Sorted.data());
+  }
+
+  [[nodiscard]] bool operator==(const RoaringAliasSet &Other) const noexcept {
+    return Bits == Other.Bits;
+  }
+
+  /// Unions \p Other into this set and invokes \p WithNewElem once for every
+  /// element that was not contained before. Any result of the callback is
+  /// ignored.
+  ///
+  /// \returns true iff at least one element was newly inserted
+  bool mergeWithDiff(const RoaringAliasSet &Other,
+                     std::invocable auto WithNewElem) {
+    constexpr size_t DiffThreshold = 16;
+    // operator- is expensive, but it is definitely a lot faster than the
+    // foreach loop if Other is large
+
+    if (Other.size() > DiffThreshold) {
+      RoaringAliasSet Diff = Other - *this;
+      if (Diff.empty()) {
+        return false;
+      }
+
+      *this |= Diff;
+
+      // Do not hand the callback to foreach() directly: foreach() treats a
+      // bool-convertible result as "continue?", so a callback that returns
+      // something falsy would silently cut the notifications short. The
+      // wrapper keeps this path consistent with the element-wise path below.
+      Diff.foreach ([&](IdT Elem) { std::invoke(WithNewElem, Elem); });
+      return true;
+    }
+
+    bool Ret = false;
+    Other.foreach ([&](IdT Elem) {
+      if (tryInsert(Elem)) {
+        std::invoke(WithNewElem, Elem);
+        Ret = true;
+      }
+    });
+    return Ret;
+  }
+
+  /// Same as the two-argument overload, but additionally inserts every newly
+  /// added element into \p IntoDiff.
+  ///
+  /// \returns true iff at least one element was newly inserted
+  bool mergeWithDiff(const RoaringAliasSet &Other,
+                     std::invocable auto WithNewElem,
+                     RoaringAliasSet &IntoDiff) {
+    constexpr size_t DiffThreshold = 16;
+    // operator- is expensive, but it is definitely a lot faster than the
+    // foreach loop if Other is large
+
+    if (Other.size() > DiffThreshold) {
+      RoaringAliasSet Diff = Other - *this;
+      if (Diff.empty()) {
+        return false;
+      }
+
+      *this |= Diff;
+      IntoDiff |= Diff;
+
+      // See the two-argument overload: the callback's result must not be
+      // interpreted as an early-exit request by foreach().
+      Diff.foreach ([&](IdT Elem) { std::invoke(WithNewElem, Elem); });
+      return true;
+    }
+
+    bool Ret = false;
+    Other.foreach ([&](IdT Elem) {
+      if (tryInsert(Elem)) {
+        IntoDiff.insert(Elem);
+        std::invoke(WithNewElem, Elem);
+        Ret = true;
+      }
+    });
+
+    return Ret;
+  }
+
+private: + RoaringAliasSet(roaring::Roaring &&RR) : Bits(std::move(RR)) {} + + roaring::Roaring Bits{}; +}; + +template using RawAliasSet = RoaringAliasSet; + } // namespace psr diff --git a/lib/PhasarLLVM/ControlFlow/ControlFlow.cppm b/lib/PhasarLLVM/ControlFlow/ControlFlow.cppm index 629bd45daf..fcba61a7e5 100644 --- a/lib/PhasarLLVM/ControlFlow/ControlFlow.cppm +++ b/lib/PhasarLLVM/ControlFlow/ControlFlow.cppm @@ -26,6 +26,7 @@ using psr::getEntryFunctionsMut; using psr::getNonPureVirtualVFTEntry; using psr::getReceiverType; using psr::getReceiverTypeName; +using psr::getStructVCallInfo; using psr::getVFTIndex; using psr::GlobalCtorsDtorsModel; using psr::ICFGBase; diff --git a/lib/PhasarLLVM/ControlFlow/Resolver/Resolver.cpp b/lib/PhasarLLVM/ControlFlow/Resolver/Resolver.cpp index 03489a592b..c4a59d60ab 100644 --- a/lib/PhasarLLVM/ControlFlow/Resolver/Resolver.cpp +++ b/lib/PhasarLLVM/ControlFlow/Resolver/Resolver.cpp @@ -38,57 +38,17 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace psr; -std::optional psr::getVFTIndex(const llvm::CallBase *CallSite) { - // deal with a virtual member function - // retrieve the vtable entry that is called - const auto *Load = - llvm::dyn_cast(CallSite->getCalledOperand()); - if (Load == nullptr) { - return std::nullopt; - } - const auto *GEP = - llvm::dyn_cast(Load->getPointerOperand()); - if (GEP == nullptr) { - return std::nullopt; - } - if (auto *CI = llvm::dyn_cast(GEP->getOperand(1))) { - return CI->getZExtValue(); - } - return std::nullopt; -} - -std::optional> -psr::getVFTIndexAndVT(const llvm::CallBase *CallSite) { - // deal with a virtual member function - // retrieve the vtable entry that is called - const auto *Load = - llvm::dyn_cast(CallSite->getCalledOperand()); - if (Load == 
nullptr) { - return std::nullopt; - } - - const auto *GEP = - llvm::dyn_cast(Load->getPointerOperand()); - if (GEP == nullptr) { - return std::nullopt; - } - - if (auto *CI = llvm::dyn_cast(GEP->getOperand(1))) { - return {{GEP->getPointerOperand(), CI->getZExtValue()}}; - } - - return std::nullopt; -} - const llvm::DIType *psr::getReceiverType(const llvm::CallBase *CallSite) { if (!CallSite || CallSite->arg_empty() || (CallSite->hasStructRetAttr() && CallSite->arg_size() < 2)) { @@ -139,69 +99,6 @@ std::string psr::getReceiverTypeName(const llvm::CallBase *CallSite) { return ""; } -bool psr::isConsistentCall(const llvm::CallBase *CallSite, - const llvm::Function *DestFun) { - if (CallSite->arg_size() < DestFun->arg_size()) { - return false; - } - if (CallSite->arg_size() != DestFun->arg_size() && !DestFun->isVarArg()) { - return false; - } - - for (const auto &[Param, ArgOp] : - llvm::zip_first(DestFun->args(), CallSite->args())) { - - const auto *ParamTy = Param.getType(); - const auto *ArgTy = ArgOp->getType(); - - if (ParamTy == ArgTy) { - // Trivial equality - continue; - } - - if (ParamTy->getTypeID() != ArgTy->getTypeID()) { - // Trivial non-equality, e.g. PointerType and IntegerType - return false; - } - - if (ParamTy->isPointerTy()) { - if (Param.hasByValAttr() != - CallSite->isByValArgument(ArgOp.getOperandNo())) { - return false; - } - - const auto *ParamSRetTy = Param.getParamStructRetType(); - const auto *ArgSRetTy = - CallSite->getParamStructRetType(ArgOp.getOperandNo()); - if ((ParamSRetTy != nullptr) != (ArgSRetTy != nullptr)) { - return false; - } - - if (ParamSRetTy && ArgSRetTy) { - // TODO: For better precision, compare the sret types as well - // Trivial non-equality, e.g. PointerType and IntegerType - if (ParamSRetTy->getTypeID() != ArgSRetTy->getTypeID()) { - // Trivial non-equality, e.g. 
PointerType and IntegerType - return false; - } - } - } - - if (ParamTy->isStructTy()) { - // Copied comment from struct-case in isTypeMatchForFunctionArgument(): - // > Well, we could do sanity checks here, but if the analysed code is - // > insane we would miss callees, so we don't do that. - - continue; - } - - // Types are non-equal and we could not find a reason to treat the same - return false; - } - - return true; -} - bool psr::isVirtualCall(const llvm::Instruction *Inst, const LLVMVFTableProvider &VTP) { assert(Inst != nullptr); diff --git a/lib/PhasarLLVM/Pointer/AndersenOTFAA.cpp b/lib/PhasarLLVM/Pointer/AndersenOTFAA.cpp new file mode 100644 index 0000000000..b8f9118e43 --- /dev/null +++ b/lib/PhasarLLVM/Pointer/AndersenOTFAA.cpp @@ -0,0 +1,1188 @@ +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. 
+ * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "phasar/PhasarLLVM/Pointer/AndersenOTFAA.h" + +#include "phasar/PhasarLLVM/DB/LLVMProjectIRDB.h" +#include "phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h" +#include "phasar/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.h" +#include "phasar/PhasarLLVM/Pointer/MemSSAUtils.h" +#include "phasar/PhasarLLVM/TypeHierarchy/DIBasedTypeHierarchy.h" +#include "phasar/PhasarLLVM/TypeHierarchy/LLVMVFTable.h" +#include "phasar/PhasarLLVM/Utils/LLVMFunctionDataFlowFacts.h" +#include "phasar/PhasarLLVM/Utils/LLVMShorthands.h" +#include "phasar/PhasarLLVM/Utils/VirtualCallUtils.h" +#include "phasar/Utils/IotaIterator.h" +#include "phasar/Utils/LibCSummary.h" +#include "phasar/Utils/LibrarySummary.h" +#include "phasar/Utils/Soundness.h" +#include "phasar/Utils/UnionFind.h" +#include "phasar/Utils/Utilities.h" +#include "phasar/Utils/ValueCompressor.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" + +#include +#include +#include + +using namespace psr; + +namespace { +/// File-local wrapper: extends PAGVariable with a variable/object flag. +/// Variable nodes (IsObject=false) represent SSA pointer values. +/// Object nodes (IsObject=true) represent abstract memory cells. 
+class AndersenVar { +public: + AndersenVar() noexcept = default; + AndersenVar(PAGVariable Base, bool IsObject) : Base(Base, IsObject) {} + + [[nodiscard]] PAGVariable getBase() const noexcept { + return Base.getPointer(); + } + [[nodiscard]] bool isObject() const noexcept { return Base.getInt(); } + + friend bool operator==(AndersenVar A, AndersenVar B) noexcept { + return A.Base == B.Base; + } + + friend auto hash_value(AndersenVar V) noexcept { + return llvm::hash_value(V.Base.getOpaqueValue()); + } + +private: + llvm::PointerIntPair Base{}; +}; +} // namespace + +namespace llvm { +template <> struct DenseMapInfo { + static AndersenVar getEmptyKey() noexcept { + return {DenseMapInfo::getEmptyKey(), false}; + } + static AndersenVar getTombstoneKey() noexcept { + return {DenseMapInfo::getTombstoneKey(), false}; + } + static unsigned getHashValue(AndersenVar V) noexcept { return hash_value(V); } + static bool isEqual(AndersenVar A, AndersenVar B) noexcept { return A == B; } +}; +} // namespace llvm + +struct [[clang::internal_linkage]] AndersenOTFSolver::SolverData { + // ---- Per-node state ------------------------------------------------- + + struct NodeInfo { + RawAliasSet PtsSet; + RawAliasSet PendingPts; + // Assignment edges: pts(this) ⊆ pts(dst) for each dst. + llvm::SmallVector AssignDsts; + llvm::SmallDenseSet AssignDstSet; // dedup guard + // Load constraints: dst = *this. + llvm::SmallVector LoadDsts; + llvm::SmallDenseSet LoadDstSet; // dedup guard + // Store constraints: *this = src. + llvm::SmallVector StoreSrcs; + llvm::SmallDenseSet StoreSrcSet; // dedup guard + // MemCopy: memcpy(dst_ptr, this=src_ptr). + llvm::SmallVector MemCopyAsSrc; + llvm::SmallDenseSet MemCopyAsSrcSet; // dedup guard + // MemCopy: memcpy(this=dst_ptr, src_ptr). + llvm::SmallVector MemCopyAsDst; + llvm::SmallDenseSet MemCopyAsDstSet; // dedup guard + }; + + // One set of ValueIds per call argument; empty means non-pointer. 
+ using ArgList = llvm::SmallVector>; + + struct FPCallRecord { + const llvm::CallBase *CS; + ValueId FPId; + ArgList Args; + std::optional CSRetVal; + }; + + struct VCallRecord { + const llvm::CallBase *CS; + ValueId VtablePtrId; + uint64_t VtableIndex; + ArgList Args; + std::optional CSRetVal; + }; + + struct StructVCallRecord { + const llvm::CallBase *CS; + ValueId BaseId; // pts(BaseId) = struct objects + ValueId FPId; // pts(FPId) = fn objects (field-insensitive fallback) + llvm::SmallVector Indices; // all GEP indices + llvm::Type *GEPElemTy; // GEP source element type (for type check) + ArgList Args; + std::optional CSRetVal; + }; + + // ---- Data fields ---------------------------------------------------- + + const LLVMProjectIRDB &IRDB; // NOLINT + const llvm::DataLayout &DL; // NOLINT + ValueCompressor &ExternalVC; // NOLINT – caller-visible output + ValueCompressor LocalVC{}; // internal variable+object nodes + Soundness SoundnessFlag; + library_summary::LLVMFunctionDataFlowFacts LibFacts; + + llvm::TargetLibraryInfoWrapperPass TLA{}; + std::optional MSSABundle{}; + llvm::MemorySSA *CurrentMemSSA = nullptr; + + llvm::SmallVector FunctionWorklist; + llvm::DenseSet Queued; // ever pushed to worklist + llvm::DenseSet Processed; + + UnionFind SCCUf; + TypedVector Nodes; + + llvm::SmallVector UnresolvedFPCalls; + llvm::SmallVector UnresolvedVCalls; + llvm::SmallVector UnresolvedStructVCalls; + llvm::DenseMap> + ConnectedCallees; + CallGraphBuilder CGBuilder; + llvm::SmallVector PropWorklist; + + // ---- Constructor ---------------------------------------------------- + + SolverData(const LLVMProjectIRDB &IRDB, + llvm::ArrayRef Entries, + ValueCompressor &VC, Soundness S) + : IRDB(IRDB), DL(IRDB.getModule()->getDataLayout()), ExternalVC(VC), + SoundnessFlag(S), LibFacts(library_summary::readFromFDFF( + getLibCSummary(), [&IRDB](llvm::StringRef Name) { + return IRDB.getFunction(Name); + })) { + + CGBuilder.reserve(IRDB.getNumFunctions()); + for (const auto *F : 
Entries) { + if (Queued.insert(F).second) { + FunctionWorklist.push_back(F); + + // entry functions may be missed in the CG, if they are never called + // explicitly in the code + std::ignore = CGBuilder.addFunctionVertex(F); + } + } + } + + // ---- Node growth ---------------------------------------------------- + + void grow(ValueId V) { + const auto Idx = size_t(V); + if (Idx >= Nodes.size()) { + Nodes.resize(Idx + 1); + SCCUf.grow(Idx + 1); + } + } + + ValueId getOrInsertVar(PAGVariable Var) { + auto [Id, _] = LocalVC.insert(AndersenVar{Var, false}); + grow(Id); + return Id; + } + + ValueId getOrInsertObj(PAGVariable Var) { + auto [Id, _] = LocalVC.insert(AndersenVar{Var, true}); + grow(Id); + return Id; + } + + // pts(VarId) for global objects: functions self-point (the address IS + // the abstract object); global variables point to their object node. + void addGlobalPointee(const llvm::GlobalObject *GO, ValueId VarId) { + if (llvm::isa(GO)) { + addPointee(VarId, VarId); + } else if (const auto *GVar = llvm::dyn_cast(GO)) { + addPointee(VarId, getOrInsertObj(PAGVariable(GVar))); + } + } + + [[nodiscard]] ValueId rep(ValueId V) const { return SCCUf.find(V); } + + // Merges the SCCs containing A and B. Returns the new representative. + // Folds all pts/edges/constraints from the non-rep into the rep, then + // clears the non-rep's NodeInfo. All NonRep data is snapshotted before any + // addAssignEdge call to avoid reference invalidation via grow(). + ValueId merge(ValueId A, ValueId B) { + A = rep(A); + B = rep(B); + if (A == B) { + return A; + } + const ValueId Rep = SCCUf.join(A, B); + const ValueId NonRep = (Rep == A) ? B : A; + + // Snapshot all NonRep data before any addAssignEdge / grow calls that + // may reallocate Nodes and invalidate references. 
+ auto NRAssignDsts = std::move(Nodes[NonRep].AssignDsts); + Nodes[NonRep].AssignDstSet.clear(); + const RawAliasSet NRPts = Nodes[NonRep].PtsSet; + auto NRLoadDsts = std::move(Nodes[NonRep].LoadDsts); + auto NRStoreSrcs = std::move(Nodes[NonRep].StoreSrcs); + auto NRMemCopyAsSrc = std::move(Nodes[NonRep].MemCopyAsSrc); + auto NRMemCopyAsDst = std::move(Nodes[NonRep].MemCopyAsDst); + + // Re-register NonRep's assign edges under Rep. + for (ValueId Dst : NRAssignDsts) { + const ValueId DstRep = rep(Dst); + if (DstRep != Rep) { + addAssignEdge(Rep, DstRep); + } + } + + // Merge pts sets. + Nodes[Rep].PtsSet.mergeWithDiff( + NRPts, [&](ValueId NewObj) { onNewPointee(Rep, NewObj); }, + Nodes[Rep].PendingPts); + + // Snapshot Rep's pts (after merge) for retroactive constraint firing. + const auto RepPts = Nodes[Rep].PtsSet; + + // Transfer NonRep's load constraints and retroactively fire them for + // Rep's existing pts members. + for (ValueId D : NRLoadDsts) { + if (Nodes[Rep].LoadDstSet.insert(D).second) { + Nodes[Rep].LoadDsts.push_back(D); + RepPts.foreach ([&](ValueId Obj) { addAssignEdge(Obj, D); }); + } + } + + // Transfer NonRep's store constraints with retroactive firing. + for (ValueId S : NRStoreSrcs) { + if (Nodes[Rep].StoreSrcSet.insert(S).second) { + Nodes[Rep].StoreSrcs.push_back(S); + RepPts.foreach ([&](ValueId Obj) { addAssignEdge(S, Obj); }); + } + } + + // Transfer NonRep's memcpy-as-src constraints with retroactive firing. + for (ValueId D : NRMemCopyAsSrc) { + if (Nodes[Rep].MemCopyAsSrcSet.insert(D).second) { + Nodes[Rep].MemCopyAsSrc.push_back(D); + if (Nodes.inbounds(D)) { + const auto &DstPts = Nodes[D].PtsSet; + RepPts.foreach ([&](ValueId O1) { + DstPts.foreach ([&](ValueId O2) { addAssignEdge(O1, O2); }); + }); + } + } + } + + // Transfer NonRep's memcpy-as-dst constraints with retroactive firing. 
+ for (ValueId S : NRMemCopyAsDst) { + if (Nodes[Rep].MemCopyAsDstSet.insert(S).second) { + Nodes[Rep].MemCopyAsDst.push_back(S); + if (Nodes.inbounds(S)) { + const auto &SrcPts = Nodes[S].PtsSet; + SrcPts.foreach ([&](ValueId O1) { + RepPts.foreach ([&](ValueId O2) { addAssignEdge(O1, O2); }); + }); + } + } + } + + Nodes[NonRep] = NodeInfo{}; + return Rep; + } + + // ---- Operand traversal ---------------------------------------------- + + void forEachOpId(const llvm::Value *V, std::invocable auto Handler) { + const llvm::Value *Stripped = V->stripPointerCastsAndAliases(); + if (definitelyContainsNoPointer(Stripped)) { + return; + } + psr::forEachPointerOperand( + Stripped, [this, Handler = copyOrRef(Handler)](const llvm::Value *Op) { + const ValueId VId = getOrInsertVar(PAGVariable(Op)); + if (const auto *GO = llvm::dyn_cast(Op)) { + addGlobalPointee(GO, VId); + } + std::invoke(Handler, VId); + }); + } + + // ---- Constraint insertion ------------------------------------------- + // + // INVARIANT: every method resolves all ids through rep() first, then calls + // grow() for all ids before accessing Nodes by reference. Any grow() call + // may reallocate the Nodes backing array, so no NodeInfo& must be held + // across a grow() call. addAssignEdge does not call grow(), so references + // into Nodes remain valid across it. + + void addPointee(ValueId Ptr, ValueId Obj) { + Ptr = rep(Ptr); + Obj = rep(Obj); + grow(Ptr); + grow(Obj); // grow before indexing Nodes[Ptr] + if (Nodes[Ptr].PtsSet.tryInsert(Obj)) { + Nodes[Ptr].PendingPts.insert(Obj); + PropWorklist.push_back(Ptr); + } + } + + void addAssignEdge(ValueId Src, ValueId Dst) { + Src = rep(Src); + Dst = rep(Dst); + if (Src == Dst) { + return; + } + + if (!Nodes.inbounds(Src) || !Nodes.inbounds(Dst)) [[unlikely]] { + llvm::report_fatal_error( + "Connecting nodes which are not allocated yet. 
Node allocation " + "should happen through getOrInsertVar or getOrInsertObj"); + } + + if (Nodes[Src].AssignDstSet.insert(Dst).second) { + Nodes[Src].AssignDsts.push_back(Dst); + if (!Nodes[Src].PtsSet.empty()) { + // New edge: Dst has never seen Src's pts history, so mark all of + // Src's current pts as pending (not just the incremental delta). + Nodes[Src].PendingPts |= Nodes[Src].PtsSet; + PropWorklist.push_back(Src); + } + } + } + + void addLoad(ValueId Ptr, ValueId Dst) { + Ptr = rep(Ptr); + Dst = rep(Dst); + grow(Ptr); + grow(Dst); + const auto &ExistingPts = Nodes[Ptr].PtsSet; + ExistingPts.foreach ([&](ValueId Obj) { addAssignEdge(Obj, Dst); }); + if (Nodes[Ptr].LoadDstSet.insert(Dst).second) { + Nodes[Ptr].LoadDsts.push_back(Dst); + } + } + + void addStore(ValueId Ptr, ValueId Src) { + Ptr = rep(Ptr); + Src = rep(Src); + grow(Ptr); + grow(Src); + const auto &ExistingPts = Nodes[Ptr].PtsSet; + ExistingPts.foreach ([&](ValueId Obj) { addAssignEdge(Src, Obj); }); + if (Nodes[Ptr].StoreSrcSet.insert(Src).second) { + Nodes[Ptr].StoreSrcs.push_back(Src); + } + } + + void addMemCopy(ValueId SrcPtr, ValueId DstPtr) { + SrcPtr = rep(SrcPtr); + DstPtr = rep(DstPtr); + grow(SrcPtr); + grow(DstPtr); + const auto &SrcPts = Nodes[SrcPtr].PtsSet; + const auto &DstPts = Nodes[DstPtr].PtsSet; + SrcPts.foreach ([&](ValueId O1) { + DstPts.foreach ([&](ValueId O2) { addAssignEdge(O1, O2); }); + }); + if (Nodes[SrcPtr].MemCopyAsSrcSet.insert(DstPtr).second) { + Nodes[SrcPtr].MemCopyAsSrc.push_back(DstPtr); + } + if (Nodes[DstPtr].MemCopyAsDstSet.insert(SrcPtr).second) { + Nodes[DstPtr].MemCopyAsDst.push_back(SrcPtr); + } + } + + // ---- Propagation ---------------------------------------------------- + + void onNewPointee(ValueId PtrRep, ValueId NewObj) { + assert(Nodes.inbounds(PtrRep)); + const auto &LoadDsts = Nodes[PtrRep].LoadDsts; + const auto &StoreSrcs = Nodes[PtrRep].StoreSrcs; + const auto &MemSrcs = Nodes[PtrRep].MemCopyAsSrc; + const auto &MemDsts = 
Nodes[PtrRep].MemCopyAsDst; + + for (ValueId Dst : LoadDsts) { + addAssignEdge(NewObj, Dst); + } + for (ValueId Src : StoreSrcs) { + addAssignEdge(Src, NewObj); + } + for (ValueId DstPtr : MemSrcs) { + if (!Nodes.inbounds(DstPtr)) { + continue; + } + const auto &DstPts = Nodes[DstPtr].PtsSet; + DstPts.foreach ([&](ValueId O2) { addAssignEdge(NewObj, O2); }); + } + for (ValueId SrcPtr : MemDsts) { + if (!Nodes.inbounds(SrcPtr)) { + continue; + } + const auto &SrcPts = Nodes[SrcPtr].PtsSet; + SrcPts.foreach ([&](ValueId O1) { addAssignEdge(O1, NewObj); }); + } + } + + void propagate() { + while (!PropWorklist.empty()) { + ValueId U = rep(PropWorklist.pop_back_val()); + if (!Nodes.inbounds(U) || Nodes[U].PendingPts.empty()) { + continue; + } + + // Snapshot resolved successors: merge() can modify Nodes[U].AssignDsts. + llvm::SmallVector Dsts; + for (ValueId V : Nodes[U].AssignDsts) { + Dsts.push_back(rep(V)); + } + + // Drain before iterating Dsts: addAssignEdge inside onNewPointee/merge() + // may write to Nodes[U].PendingPts while we iterate. + RawAliasSet UPending = std::exchange(Nodes[U].PendingPts, {}); + + for (ValueId VSnap : Dsts) { + // Re-resolve: a prior iteration's merge() may have changed the rep. + const ValueId V = rep(VSnap); + if (V == U || !Nodes.inbounds(V)) { + continue; + } + + const bool AddedAny = Nodes[V].PtsSet.mergeWithDiff( + UPending, [this, V](ValueId Obj) { onNewPointee(V, Obj); }, + Nodes[V].PendingPts); + + if (!AddedAny) { + // LCD: V has all of U's pending wave, so V.PtsSet ⊇ U.PtsSet. 
+ if (Nodes[V].AssignDstSet.contains(U)) { + U = merge(U, V); + } + continue; + } + PropWorklist.push_back(V); + } + } + } + + // ---- IR translation ------------------------------------------------- + + void initGlobals() { + GlobalInitCache GCache; + for (const auto &G : IRDB.getModule()->globals()) { + if (definitelyContainsNoPointer(G.getValueType())) { + continue; + } + const ValueId VarId = getOrInsertVar(PAGVariable(&G)); + const ValueId ObjId = getOrInsertObj(PAGVariable(&G)); + addPointee(VarId, ObjId); + if (!G.hasInitializer()) { + continue; + } + for (ValueId SrcId : + GCache.getOrCreate(G.getInitializer(), [&](const llvm::Value *V) { + const ValueId VId = getOrInsertVar(PAGVariable(V)); + if (const auto *GO = llvm::dyn_cast(V)) { + addGlobalPointee(GO, VId); + } + return VId; + })) { + addStore(VarId, SrcId); + } + } + propagate(); + } + + void processFunction(const llvm::Function *F) { + MSSABundle.emplace(const_cast(*F), &TLA.getTLI(*F)); + CurrentMemSSA = &MSSABundle->MSSA; + for (const auto &Arg : F->args()) { + if (!definitelyContainsNoPointer(&Arg)) { + (void)getOrInsertVar(PAGVariable(&Arg)); + } + } + for (const auto &I : llvm::instructions(F)) { + processInstruction(I); + } + } + + void addPtrAlias(const llvm::Value *V, const llvm::Value *Src) { + forEachOpId(Src, [&](ValueId OpId) { + LocalVC.addAlias(AndersenVar{PAGVariable(V), false}, OpId); + grow(OpId); + }); + } + + void processInstruction(const llvm::Instruction &I) { + if (const auto *Alloca = llvm::dyn_cast(&I)) { + const ValueId VarId = getOrInsertVar(PAGVariable(Alloca)); + const ValueId ObjId = getOrInsertObj(PAGVariable(Alloca)); + addPointee(VarId, ObjId); + return; + } + if (const auto *S = llvm::dyn_cast(&I)) { + handleStore(S); + return; + } + if (const auto *L = llvm::dyn_cast(&I)) { + handleLoad(L); + return; + } + if (const auto *M = llvm::dyn_cast(&I)) { + handleMemTransfer(M); + return; + } + if (const auto *C = llvm::dyn_cast(&I)) { + handleCall(C); + return; + } + if 
(const auto *R = llvm::dyn_cast(&I)) { + handleReturn(R); + return; + } + if (const auto *P = llvm::dyn_cast(&I)) { + handlePhi(P); + return; + } + if (const auto *S = llvm::dyn_cast(&I)) { + handleSelect(S); + return; + } + + // Casts: alias result to stripped operand (field-insensitive). + if (const auto *Cast = llvm::dyn_cast(&I)) { + if (!definitelyContainsNoPointer(Cast)) { + addPtrAlias(Cast, Cast->getOperand(0)); + } + return; + } + + // GEPs: alias result to base pointer (field-insensitive). + if (const auto *GEP = llvm::dyn_cast(&I)) { + addPtrAlias(GEP, GEP->getPointerOperand()); + } + } + + void handleStore(const llvm::StoreInst *S) { + if (definitelyContainsNoPointer(S->getValueOperand())) { + return; + } + forEachOpId(S->getPointerOperand(), [&](ValueId PtrId) { + forEachOpId(S->getValueOperand(), + [&](ValueId ValId) { addStore(PtrId, ValId); }); + }); + } + + void handleLoad(const llvm::LoadInst *L) { + if (definitelyContainsNoPointer(L)) { + return; + } + if (CurrentMemSSA) { + llvm::SmallPtrSet Defs; + const bool HasLiveOnEntry = collectReachingDefs(L, *CurrentMemSSA, Defs); + if (!HasLiveOnEntry) { + if (Defs.size() == 1) { + const auto *ValueOp = (*Defs.begin())->getValueOperand(); + if (!llvm::isa(ValueOp) && + !definitelyContainsNoPointer(ValueOp)) { + addPtrAlias(L, ValueOp); + return; + } + // Non-pointer or ConstantExpr store value: fall through to addLoad. + } else { + const ValueId DstId = getOrInsertVar(PAGVariable(L)); + bool AnyEdge = false; + for (const auto *Def : Defs) { + forEachOpId(Def->getValueOperand(), [&](ValueId SrcId) { + addAssignEdge(SrcId, DstId); + AnyEdge = true; + }); + } + if (AnyEdge) { + return; + } + // All reaching stores have non-pointer value operands: + // fall through to addLoad. 
+ } + } + } + const ValueId DstId = getOrInsertVar(PAGVariable(L)); + forEachOpId(L->getPointerOperand(), + [&](ValueId PtrId) { addLoad(PtrId, DstId); }); + } + + void handleMemTransfer(const llvm::MemTransferInst *M) { + forEachOpId(M->getDest(), [&](ValueId DstPtr) { + forEachOpId(M->getSource(), + [&](ValueId SrcPtr) { addMemCopy(SrcPtr, DstPtr); }); + }); + } + + void handlePhi(const llvm::PHINode *P) { + if (definitelyContainsNoPointer(P)) { + return; + } + const ValueId PhiId = getOrInsertVar(PAGVariable(P)); + for (const auto &Inc : P->incoming_values()) { + if (definitelyContainsNoPointer(Inc.get())) { + continue; + } + forEachOpId(Inc.get(), + [&](ValueId IncId) { addAssignEdge(IncId, PhiId); }); + } + } + + void handleSelect(const llvm::SelectInst *S) { + if (definitelyContainsNoPointer(S)) { + return; + } + const ValueId SelId = getOrInsertVar(PAGVariable(S)); + const auto *TV = S->getTrueValue(); + const auto *FV = S->getFalseValue(); + if (!definitelyContainsNoPointer(TV)) { + forEachOpId(TV, [&](ValueId Id) { addAssignEdge(Id, SelId); }); + } + if (!definitelyContainsNoPointer(FV)) { + forEachOpId(FV, [&](ValueId Id) { addAssignEdge(Id, SelId); }); + } + } + + void handleReturn(const llvm::ReturnInst *R) { + const auto *RetVal = R->getReturnValue(); + if (!RetVal || definitelyContainsNoPointer(RetVal)) { + return; + } + const ValueId RetSlotId = + getOrInsertVar(PAGVariable::Return{R->getFunction()}); + forEachOpId(RetVal, + [&](ValueId ValId) { addAssignEdge(ValId, RetSlotId); }); + } + + // ---- Call-graph co-refinement --------------------------------------- + + // For each argument, add every function in pts(ArgId) to the worklist + // as an entry point. Used when a callee is a declaration and we want to + // treat fn-ptr arguments as reachable callbacks (Soundy / Sound mode). 
+ void + addFnPtrArgsAsEntries(llvm::ArrayRef> Args) { + for (const auto &ArgIds : Args) { + for (ValueId ArgId : ArgIds) { + ArgId = rep(ArgId); + if (!Nodes.inbounds(ArgId)) { + continue; + } + Nodes[ArgId].PtsSet.foreach ([&](ValueId ObjId) { + if (!Nodes.inbounds(ObjId)) { + return false; + } + for (const auto &Var : LocalVC.id2vars(ObjId)) { + const auto *Fun = llvm::dyn_cast_or_null( + Var.getBase().valueOrNull()); + if (Fun && !Fun->isDeclaration() && Queued.insert(Fun).second) { + FunctionWorklist.push_back(Fun); + std::ignore = CGBuilder.addFunctionVertex(Fun); + } + } + return true; + }); + } + } + } + + void applyLibrarySummary( + const library_summary::LLVMFunctionDataFlowFacts::ParameterMappingTy + &LibSum, + const llvm::Function *Fun, + llvm::ArrayRef> Args, + std::optional CSRetVal) { + const size_t NumParams = Fun->arg_size(); + for (const auto &[ParamIdx, Dests] : LibSum) { + if (ParamIdx >= NumParams || ParamIdx >= Args.size() || + !Fun->getArg(ParamIdx)->getType()->isPointerTy()) { + continue; + } + for (const auto &DestFact : Dests) { + if (const auto *DestParam = + DestFact.dyn_cast()) { + if (DestParam->Index >= Args.size()) { + continue; + } + for (ValueId DstId : Args[DestParam->Index]) { + for (ValueId SrcId : Args[ParamIdx]) { + addStore(DstId, SrcId); + } + } + } else { + if (!CSRetVal) { + continue; + } + for (ValueId SrcId : Args[ParamIdx]) { + addAssignEdge(SrcId, *CSRetVal); + } + } + } + } + } + + bool connectCallee(const llvm::CallBase *CS, const llvm::Function *Callee, + llvm::ArrayRef> Args, + std::optional CSRetVal) { + const ValueId CalleeId = getOrInsertVar(PAGVariable(Callee)); + if (!ConnectedCallees[CS].insert(CalleeId).second) { + return false; + } + CGBuilder.addCallEdge(CS, Callee); + + if (Callee->isDeclaration()) { + if (const auto *LibSum = LibFacts.getFactsForFunctionOrNull(Callee)) { + applyLibrarySummary(*LibSum, Callee, Args, CSRetVal); + return false; + } + if (SoundnessFlag != Soundness::Unsound) { + 
addFnPtrArgsAsEntries(Args); + } + return false; + } + + if (Queued.insert(Callee).second) { + FunctionWorklist.push_back(Callee); + } + + if (CSRetVal && !Callee->getReturnType()->isVoidTy()) { + const ValueId RetSlotId = getOrInsertVar(PAGVariable::Return{Callee}); + addAssignEdge(RetSlotId, *CSRetVal); + } + + for (const auto &[Param, ArgIds] : llvm::zip(Callee->args(), Args)) { + if (ArgIds.empty() || definitelyContainsNoPointer(&Param)) { + continue; + } + const ValueId ParamId = getOrInsertVar(PAGVariable(&Param)); + for (ValueId ArgId : ArgIds) { + addAssignEdge(ArgId, ParamId); + } + } + + propagate(); + return true; + } + + bool resolveVtableCall(const llvm::CallBase *CS, ValueId VtablePtrId, + uint64_t VtableIndex, const ArgList &Args, + std::optional CSRetVal) { + VtablePtrId = rep(VtablePtrId); + if (!Nodes.inbounds(VtablePtrId)) { + llvm::report_fatal_error("Invalid Vtable Id #" + + llvm::Twine(uint32_t(VtablePtrId))); + } + bool NewEdge = false; + // Snapshot: connectCallee→propagate() may grow pts(VtablePtrId). 
+ const RawAliasSet VPPts = Nodes[VtablePtrId].PtsSet; + VPPts.foreach ([&](ValueId ObjId) { + if (!Nodes.inbounds(ObjId)) { + return false; + } + for (const auto &Var : LocalVC.id2vars(ObjId)) { + const auto *GV = llvm::dyn_cast_or_null( + Var.getBase().valueOrNull()); + if (!GV || !GV->hasName() || + !GV->getName().starts_with(DIBasedTypeHierarchy::VTablePrefix) || + !GV->hasInitializer()) { + continue; + } + const auto *VTStruct = + llvm::dyn_cast(GV->getInitializer()); + if (!VTStruct) { + continue; + } + auto VFs = LLVMVFTable::getVFVectorFromIRVTable(*VTStruct); + if (VtableIndex >= VFs.size()) { + continue; + } + const auto *Callee = VFs[VtableIndex]; + if (!Callee || !isConsistentCall(CS, Callee)) { + continue; + } + NewEdge |= connectCallee(CS, Callee, Args, CSRetVal); + } + return true; + }); + return NewEdge; + } + + bool resolveStructVCall(const StructVCallRecord &Rec) { + const ValueId BaseId = rep(Rec.BaseId); + if (!Nodes.inbounds(BaseId)) { + llvm::report_fatal_error("Invalid BaseId in resolveStructVCall"); + } + bool NewEdge = false; + bool NeedFPFallback = false; + // Snapshot: connectCallee->propagate() may grow pts(BaseId). + const RawAliasSet BasePts = Nodes[BaseId].PtsSet; + BasePts.foreach ([&](ValueId ObjId) { + if (!Nodes.inbounds(ObjId)) { + return false; + } + for (const auto &Var : LocalVC.id2vars(ObjId)) { + // Resolve GlobalAlias to the underlying GlobalVariable. + const llvm::Value *Val = Var.getBase().valueOrNull(); + if (const auto *GA = llvm::dyn_cast_or_null(Val)) { + Val = GA->getAliaseeObject(); + } + const auto *GV = llvm::dyn_cast_or_null(Val); + if (!GV || !GV->isConstant() || !GV->hasInitializer()) { + NeedFPFallback = true; + continue; + } + // Type check: GV must be of GEPElemTy or [N x GEPElemTy]. + // Field-insensitive aliasing can put wrong-type objects in pts. 
+ llvm::Type *const GVTy = GV->getValueType(); + if (GVTy != Rec.GEPElemTy) { + const auto *ArrTy = llvm::dyn_cast(GVTy); + if (!ArrTy || ArrTy->getElementType() != Rec.GEPElemTy) { + NeedFPFallback = true; + continue; + } + } + const auto *Callee = + walkConstInitPath(GV->getInitializer(), Rec.Indices); + if (!Callee || !isConsistentCall(Rec.CS, Callee)) { + continue; + } + NewEdge |= connectCallee(Rec.CS, Callee, Rec.Args, Rec.CSRetVal); + } + return true; + }); + if (NeedFPFallback) { + NewEdge |= resolveFPCall(Rec.CS, Rec.FPId, Rec.Args, Rec.CSRetVal); + } + return NewEdge; + } + + bool resolveFPCall(const llvm::CallBase *CS, ValueId FPId, + const ArgList &Args, std::optional CSRetVal) { + FPId = rep(FPId); + if (!Nodes.inbounds(FPId)) { + llvm::report_fatal_error("Invalid FPId"); + } + bool NewEdge = false; + // Snapshot pts(FPId): connectCallee→propagate() may grow pts(FPId). + const RawAliasSet FPPts = Nodes[FPId].PtsSet; + FPPts.foreach ([&](ValueId ObjId) { + if (!Nodes.inbounds(ObjId)) { + // Iteration is in sorted order + return false; + } + for (const auto &Var : LocalVC.id2vars(ObjId)) { + const auto *Fun = + llvm::dyn_cast_or_null(Var.getBase().valueOrNull()); + if (Fun && isConsistentCall(CS, Fun)) { + NewEdge |= connectCallee(CS, Fun, Args, CSRetVal); + } + } + return true; + }); + return NewEdge; + } + + void handleCall(const llvm::CallBase *C) { + if (C->isInlineAsm() || C->isDebugOrPseudoInst()) { + return; + } + + // Build one entry per call argument: empty inner vector = non-pointer. 
+ ArgList Args; + for (const auto &Arg : C->args()) { + auto &ArgIds = Args.emplace_back(); + if (!definitelyContainsNoPointer(Arg.get())) { + forEachOpId(Arg.get(), [&](ValueId Id) { ArgIds.push_back(Id); }); + } + } + + std::optional CSRetVal; + if (C->getType()->isPointerTy()) { + const ValueId VarId = getOrInsertVar(PAGVariable(C)); + CSRetVal = VarId; + const auto *DirectCallee = llvm::dyn_cast( + C->getCalledOperand()->stripPointerCastsAndAliases()); + if (DirectCallee && + psr::isHeapAllocatingFunction(DirectCallee->getName())) { + const ValueId ObjId = getOrInsertObj(PAGVariable(C)); + addPointee(VarId, ObjId); + } + } + + const auto *FnPtr = C->getCalledOperand()->stripPointerCastsAndAliases(); + + if (const auto *Callee = llvm::dyn_cast(FnPtr)) { + connectCallee(C, Callee, Args, CSRetVal); + return; + } + + // Virtual call: read the concrete vtable at the specific slot index. + if (auto VCallInfo = getVFTIndexAndVT(C)) { + auto [VtablePtr, VtableIndex] = *VCallInfo; + const ValueId VtablePtrId = getOrInsertVar(PAGVariable(VtablePtr)); + resolveVtableCall(C, VtablePtrId, VtableIndex, Args, CSRetVal); + UnresolvedVCalls.push_back(VCallRecord{ + .CS = C, + .VtablePtrId = VtablePtrId, + .VtableIndex = VtableIndex, + .Args = std::move(Args), + .CSRetVal = CSRetVal, + }); + return; + } + + // Struct-field vtable call: call(load(GEP(base, const_indices...))) + // with a typed (>=3-operand) GEP. Resolve via global initializer for + // const globals; fall back to FP resolution for non-const objects. 
+ if (auto SVInfo = getStructVCallInfo(C)) { + auto &[BasePtr, Indices, GEPElemTy] = *SVInfo; + const auto *Load = llvm::cast(C->getCalledOperand()); + const ValueId BaseId = getOrInsertVar(PAGVariable(BasePtr)); + const ValueId FPId = getOrInsertVar(PAGVariable(Load)); + StructVCallRecord Rec{ + .CS = C, + .BaseId = BaseId, + .FPId = FPId, + .Indices = std::move(Indices), + .GEPElemTy = GEPElemTy, + .Args = std::move(Args), + .CSRetVal = CSRetVal, + }; + resolveStructVCall(Rec); + UnresolvedStructVCalls.push_back(std::move(Rec)); + return; + } + + // Indirect call: connect already-known targets, record for fixpoint. + const ValueId FPId = getOrInsertVar(PAGVariable(FnPtr)); + resolveFPCall(C, FPId, Args, CSRetVal); + UnresolvedFPCalls.push_back(FPCallRecord{ + .CS = C, + .FPId = FPId, + .Args = std::move(Args), + .CSRetVal = CSRetVal, + }); + } + + bool checkUnresolvedFPCalls() { + bool NewEdge = false; + for (const auto &Rec : UnresolvedFPCalls) { + NewEdge |= resolveFPCall(Rec.CS, Rec.FPId, Rec.Args, Rec.CSRetVal); + } + return NewEdge; + } + + bool checkUnresolvedVCalls() { + bool NewEdge = false; + for (const auto &Rec : UnresolvedVCalls) { + NewEdge |= resolveVtableCall(Rec.CS, Rec.VtablePtrId, Rec.VtableIndex, + Rec.Args, Rec.CSRetVal); + } + return NewEdge; + } + + bool checkUnresolvedStructVCalls() { + bool NewEdge = false; + for (const auto &Rec : UnresolvedStructVCalls) { + NewEdge |= resolveStructVCall(Rec); + } + return NewEdge; + } + + // ---- Result construction -------------------------------------------- + + AndersenOTFResult buildResult() { + const size_t NumLocal = LocalVC.size(); + + // Map variable local IDs → external VC IDs. + // Object nodes are internal only and do not appear in the external result. 
+ TypedVector> LocalToExt(NumLocal); + for (auto VId : iota(NumLocal)) { + std::optional FirstExtId; + for (const auto &V : LocalVC.id2vars(VId)) { + if (V.isObject()) { + continue; + } + if (!FirstExtId) { + FirstExtId = ExternalVC.insert(V.getBase()).first; + LocalToExt[VId] = FirstExtId; + } else { + ExternalVC.addAlias(V.getBase(), *FirstExtId); + } + } + } + + // Build rep → bitset of external IDs for all vars in that SCC. + TypedVector> RepToExtVIds(NumLocal); + for (auto VId : iota(NumLocal)) { + if (!LocalToExt[VId]) { + continue; + } + const ValueId RepId = rep(VId); + if (!Nodes.inbounds(RepId)) { + continue; + } + RepToExtVIds[RepId].push_back(*LocalToExt[VId]); + } + + // Reverse map: abstract object → bitset of representatives pointing to it. + // Only representatives with at least one external variable are inserted. + TypedVector> Obj2Reps(NumLocal); + for (auto RepId : iota(NumLocal)) { + if (RepToExtVIds[RepId].empty()) { + continue; + } + if (!Nodes.inbounds(RepId)) { + continue; + } + Nodes[RepId].PtsSet.foreach ([&](ValueId Obj) { + if (Obj2Reps.inbounds(Obj)) { + Obj2Reps[Obj].insert(RepId); + return true; + } + // Iteration is in sorted order + return false; + }); + } + + // Precompute per-object alias set: for each abstract object, the union of + // all external IDs of every representative that points to it. Built once + // here via sort+insertSorted so the main loop below can use fast |=. 
+ TypedVector> ObjToAliasExtVIds(NumLocal); + { + llvm::SmallVector Buf; + for (const auto &[Obj, Reps] : Obj2Reps.enumerate()) { + if (Reps.empty()) { + continue; + } + Reps.foreach ([&](ValueId AliasRepId) { + for (auto EId : RepToExtVIds[AliasRepId]) { + Buf.push_back(uint32_t(EId)); + } + }); + std::ranges::sort(Buf); + ObjToAliasExtVIds[Obj].insertSorted(Buf); + Buf.clear(); + } + } + + AndersenOTFResult Result{}; + Result.AliasSets.resize(ExternalVC.size()); + + for (const auto &[RepId, ExtVIds] : RepToExtVIds.enumerate()) { + if (ExtVIds.empty()) { + continue; + } + if (!Nodes.inbounds(RepId)) { + break; // iota is monotone; all subsequent IDs exceed Nodes.size() + } + + // Union the pre-built per-object alias sets for all pointees. + RawAliasSet AliasExtVIds; + Nodes[RepId].PtsSet.foreach ([&](ValueId Obj) { + if (size_t(Obj) >= NumLocal) { + // Iteration is in sorted order + return false; + } + AliasExtVIds |= ObjToAliasExtVIds[Obj]; + return true; + }); + + // Broadcast to every external ID mapped to this representative. + for (auto ExtVId : ExtVIds) { + Result.AliasSets[ExtVId] |= AliasExtVIds; + } + } + + Result.CG = CGBuilder.consumeCallGraph(); + return Result; + } + + // ---- Main loop ------------------------------------------------------ + + AndersenOTFResult run() { + initGlobals(); + + bool Changed{}; + do { + while (!FunctionWorklist.empty()) { + const auto *F = FunctionWorklist.pop_back_val(); + if (!Processed.insert(F).second) { + continue; + } + processFunction(F); + // Drain pending pts for functions that make no pointer-relevant + // calls (connectCallee would otherwise be the only propagate site). 
+ propagate(); + } + Changed = checkUnresolvedFPCalls(); + Changed |= checkUnresolvedVCalls(); + Changed |= checkUnresolvedStructVCalls(); + } while (!FunctionWorklist.empty() || Changed); + + return buildResult(); + } +}; + +// ---- AndersenOTFSolver -------------------------------------------------- + +AndersenOTFSolver::AndersenOTFSolver( + const LLVMProjectIRDB &IRDB, llvm::ArrayRef Entries, + ValueCompressor &VC, Soundness S) noexcept + : IRDB(IRDB), Entries(Entries), VC(VC), S(S) {} + +AndersenOTFResult AndersenOTFSolver::solve() { + SolverData Impl{*IRDB, Entries, *VC, S}; + return Impl.run(); +} + +// ---- Factory functions -------------------------------------------------- + +AndersenOTFResult +psr::computeAndersenOTFRaw(const LLVMProjectIRDB &IRDB, + llvm::ArrayRef EntryPoints, + MaybeUniquePtr> VC, + Soundness S) { + if (!VC) { + VC = std::make_unique>(); + } + AndersenOTFSolver Solver(IRDB, EntryPoints, *VC, S); + return Solver.solve(); +} + +LLVMUnionFindAliasIterator +psr::computeAndersenOTF(const LLVMProjectIRDB &IRDB, + llvm::ArrayRef EntryPoints, + MaybeUniquePtr> VC, + Soundness S) { + if (!VC) { + VC = std::make_unique>(); + } + AndersenOTFSolver Solver(IRDB, EntryPoints, *VC, S); + auto Res = Solver.solve(); + return LLVMUnionFindAliasIterator{std::move(Res), std::move(VC)}; +} diff --git a/lib/PhasarLLVM/Pointer/CMakeLists.txt b/lib/PhasarLLVM/Pointer/CMakeLists.txt index 1736a41820..bb01878905 100644 --- a/lib/PhasarLLVM/Pointer/CMakeLists.txt +++ b/lib/PhasarLLVM/Pointer/CMakeLists.txt @@ -9,6 +9,7 @@ add_phasar_library(phasar_llvm_pointer phasar_controlflow phasar_llvm_utils phasar_llvm_db + phasar_llvm_typehierarchy LLVM_LINK_COMPONENTS Core diff --git a/lib/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.cpp b/lib/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.cpp index 3776097c7e..d3b0ffae77 100644 --- a/lib/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.cpp +++ b/lib/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.cpp @@ -1,6 +1,8 @@ #include 
"phasar/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.h" #include "phasar/PhasarLLVM/DB/LLVMProjectIRDB.h" +#include "phasar/PhasarLLVM/Pointer/LLVMGlobalInitCache.h" +#include "phasar/PhasarLLVM/Pointer/MemSSAUtils.h" #include "phasar/PhasarLLVM/Utils/LLVMFunctionDataFlowFacts.h" #include "phasar/PhasarLLVM/Utils/LLVMShorthands.h" #include "phasar/Pointer/PointerAssignmentGraph.h" @@ -11,15 +13,11 @@ #include "phasar/Utils/ValueCompressor.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" @@ -44,68 +42,6 @@ std::string psr::to_string(PAGVariable Var) { namespace { -struct GlobalCache { - const llvm::DataLayout &DL; // NOLINT - // Due to the recursion in getOrCreateGCacheEntry, we need pointer stability - std::unordered_map> - Cache{}; - - [[nodiscard]] llvm::ArrayRef getOrCreateGCacheEntry( - LLVMPBStrategyRef Strategy, const llvm::Constant *Const, - std::invocable auto GetVariable) { - if (definitelyContainsNoPointer(Const)) { - return {}; - } - - auto [It, Inserted] = Cache.try_emplace(Const); - if (!Inserted) { - return It->second; - } - - auto &Vec = It->second; - - // We do not care about null here - if (llvm::isa(Const)) { - return {}; - } - - if (const auto *CGep = llvm::dyn_cast(Const)) { - // TODO: Properly handle constant GEPs - return getOrCreateGCacheEntry( - Strategy, llvm::cast(CGep->getPointerOperand()), - GetVariable); - } - - if (Const->getType()->isPointerTy()) { - Vec.push_back(GetVariable(Const, Strategy)); - - return Vec; - } - - // TODO: Get rid of the recursion - - if (const auto *Arr = llvm::dyn_cast(Const)) { - if 
(Arr->getType()->isArrayTy() && - definitelyContainsNoPointer(Arr->getType()->getArrayElementType())) { - return {}; - } - - size_t ArrayLen = Arr->getNumOperands(); - for (size_t I = 0; I < ArrayLen; ++I) { - auto *Elem = llvm::cast( - Arr->getAggregateElement(I)->stripPointerCastsAndAliases()); - auto ElemVars = getOrCreateGCacheEntry(Strategy, Elem, GetVariable); - Vec.append(ElemVars.begin(), ElemVars.end()); - } - return Vec; - } - - // TODO: more - - return Vec; - } -}; - struct PAGMappedLibrarySummary { const library_summary::LLVMFunctionDataFlowFacts &Facts; // NOLINT @@ -137,60 +73,6 @@ struct PAGMappedLibrarySummary { } }; -// Bundle of per-function analyses for the built-in MemorySSA provider. -// Members are declared in initialization order: each field depends only on -// the ones before it. MSSA is constructed last in the body (after -// AA.addAAResult) because MemorySSA is neither movable nor copyable. -struct MemSSABundle { - llvm::AssumptionCache AC; - llvm::DominatorTree DT; - llvm::BasicAAResult BAA; - llvm::AAResults AA; - llvm::MemorySSA MSSA; - - explicit MemSSABundle(llvm::Function &F, const llvm::TargetLibraryInfo *TLI) - : AC(F), DT(F), - BAA(F.getParent()->getDataLayout(), F, assertNotNull(TLI), AC, &DT), - AA([](const auto *TLI, auto *BAA) { - llvm::AAResults AA(*TLI); - AA.addAAResult(*BAA); - return AA; - }(TLI, &BAA)), - MSSA(F, &AA, &DT) {} -}; - -// returns HasLiveOnEntry -static bool -collectReachingDefs(llvm::MemoryAccess *MA, const llvm::MemorySSA &MSSA, - llvm::SmallPtrSetImpl &Defs, - llvm::SmallPtrSetImpl &Visited) { - if (!Visited.insert(MA).second) { - return false; - } - if (MSSA.isLiveOnEntryDef(MA)) { - return true; - } - if (auto *Def = llvm::dyn_cast(MA)) { - // We only care about stores for now - if (const auto *St = - llvm::dyn_cast(Def->getMemoryInst())) { - Defs.insert(St); - return false; - } - return true; - } - if (auto *Phi = llvm::dyn_cast(MA)) { - for (const auto &Inc : Phi->incoming_values()) { - bool LOE = 
collectReachingDefs(llvm::cast(Inc.get()), - MSSA, Defs, Visited); - if (LOE) { - return true; - } - } - } - return false; -} - } // namespace struct [[clang::internal_linkage]] LLVMPAGBuilder::PAGBuildData { @@ -270,7 +152,7 @@ struct [[clang::internal_linkage]] LLVMPAGBuilder::PAGBuildData { void initializeGlobals(const LLVMProjectIRDB &IRDB, LLVMPBStrategyRef Strategy) { - GlobalCache GCache{IRDB.getModule()->getDataLayout()}; + GlobalInitCache GCache; for (const auto &Glob : IRDB.getModule()->globals()) { if (definitelyContainsNoPointer(Glob.getValueType())) { @@ -283,14 +165,13 @@ struct [[clang::internal_linkage]] LLVMPAGBuilder::PAGBuildData { } } - void initializeGlobal(GlobalCache &GCache, LLVMPBStrategyRef Strategy, + void initializeGlobal(GlobalInitCache &GCache, LLVMPBStrategyRef Strategy, const llvm::GlobalVariable &Glob) { auto GlobObj = getVariable(&Glob, Strategy); - auto Stores = GCache.getOrCreateGCacheEntry( - Strategy, Glob.getInitializer(), - [this](const llvm::Value *V, LLVMPBStrategyRef Strategy) { - return getVariable(V, Strategy); - }); + auto Stores = GCache.getOrCreate(Glob.getInitializer(), + [this, Strategy](const llvm::Value *V) { + return getVariable(V, Strategy); + }); for (auto Src : Stores) { // NOTE: We don't consider this a POI for now; probably, that's fine @@ -421,47 +302,8 @@ struct [[clang::internal_linkage]] LLVMPAGBuilder::PAGBuildData { static void handleOperand(const llvm::Value *RawOp, std::invocable auto Handler) { - RawOp = RawOp->stripPointerCastsAndAliases(); - const auto *RawOpCExpr = llvm::dyn_cast(RawOp); - if (!RawOpCExpr) [[likely]] { - // fast-path: - return (void)std::invoke(Handler, RawOp); - } - - llvm::SmallDenseSet Seen = {RawOp}; - llvm::SmallVector WL = {RawOpCExpr}; - do { - const auto *Curr = WL.pop_back_val(); - for (const auto *Op : Curr->operand_values()) { - if (definitelyContainsNoPointer(Op) || !Seen.insert(Op).second) { - continue; - } - - if (const auto *GObj = llvm::dyn_cast(Op)) { - 
std::invoke(Handler, GObj); - continue; - } - - if (const auto *GEPOp = llvm::dyn_cast(Op)) { - const auto *PtrOp = GEPOp->getPointerOperand(); - if (!definitelyContainsNoPointer(PtrOp) && - Seen.insert(PtrOp).second) { - if (const auto *PtrUser = llvm::dyn_cast(PtrOp)) { - WL.push_back(PtrUser); - } else { - std::invoke(Handler, PtrOp); - } - } - continue; - } - - if (const auto *OpUser = llvm::dyn_cast(Op)) { - WL.push_back(OpUser); - continue; - } - } - - } while (!WL.empty()); + // TODO: Handle constant GEP! + psr::forEachPointerOperand(RawOp, copyOrRef(Handler)); } void handleStore(LLVMPBStrategyRef Strategy, const llvm::StoreInst *Store) { @@ -485,33 +327,33 @@ struct [[clang::internal_linkage]] LLVMPAGBuilder::PAGBuildData { } if (CurrentMemSSA) { - if (auto *Access = CurrentMemSSA->getMemoryAccess(Ld)) { - auto *Clobber = - CurrentMemSSA->getWalker()->getClobberingMemoryAccess(Access); - llvm::SmallPtrSet Defs; - llvm::SmallPtrSet Visited; - const bool HasLiveOnEntry = - collectReachingDefs(Clobber, *CurrentMemSSA, Defs, Visited); - - if (!HasLiveOnEntry) { - - if (Defs.size() == 1) { - const auto *ValueOp = (*Defs.begin())->getValueOperand(); - if (!llvm::isa(ValueOp)) { - VC.addAlias(Ld, getVariable(ValueOp, Strategy)); - return; - } + llvm::SmallPtrSet Defs; + const bool HasLiveOnEntry = collectReachingDefs(Ld, *CurrentMemSSA, Defs); + if (!HasLiveOnEntry) { + if (Defs.size() == 1) { + const auto *ValueOp = (*Defs.begin())->getValueOperand(); + if (!llvm::isa(ValueOp) && + !definitelyContainsNoPointer(ValueOp)) { + VC.addAlias(Ld, getVariable(ValueOp, Strategy)); + return; } + } - auto LoadObj = getVariable(Ld, Strategy); - for (const auto *Def : Defs) { - handleOperand(Def->getValueOperand(), [&](const auto *ValOp) { - Strategy.onAddEdge(getVariable(ValOp, Strategy), LoadObj, - Assign{}, Ld); - }); - } + auto LoadObj = getVariable(Ld, Strategy); + bool AddedAny = false; + for (const auto *Def : Defs) { + handleOperand(Def->getValueOperand(), [&](const 
auto *ValOp) { + Strategy.onAddEdge(getVariable(ValOp, Strategy), LoadObj, Assign{}, + Ld); + AddedAny = true; + }); + } + + if (AddedAny) { return; } + // All reaching stores have non-pointer value operands: + // fall through to addEdge. } } diff --git a/lib/PhasarLLVM/Pointer/MemSSAUtils.cpp b/lib/PhasarLLVM/Pointer/MemSSAUtils.cpp new file mode 100644 index 0000000000..0d1f024ef9 --- /dev/null +++ b/lib/PhasarLLVM/Pointer/MemSSAUtils.cpp @@ -0,0 +1,76 @@ +/****************************************************************************** + * Copyright (c) 2026 Fabian Schiebel. + * All rights reserved. This program and the accompanying materials are made + * available under the terms of LICENSE.txt. + * + * Contributors: + * Fabian Schiebel and others + *****************************************************************************/ + +#include "phasar/PhasarLLVM/Pointer/MemSSAUtils.h" + +#include "phasar/Utils/Utilities.h" + +using namespace psr; + +MemSSABundle::MemSSABundle(llvm::Function &F, + const llvm::TargetLibraryInfo *TLI) + : AC(F), DT(F), TBAA( +#if LLVM_VERSION_MAJOR > 19 + /*UsingTypeSanitizer=*/false +#endif + ), + SNA(), + BAA(F.getParent()->getDataLayout(), F, assertNotNull(TLI), AC, &DT), + AA([](const auto *TLI, auto *TBAA, auto *SNA, auto *BAA) { + llvm::AAResults AA(*TLI); + AA.addAAResult(*TBAA); + AA.addAAResult(*SNA); + AA.addAAResult(*BAA); + return AA; + }(TLI, &TBAA, &SNA, &BAA)), + MSSA(F, &AA, &DT) { +} + +bool psr::collectReachingDefs( + llvm::MemoryAccess *MA, const llvm::MemorySSA &MSSA, + llvm::SmallPtrSetImpl &ReachingDefs, + llvm::SmallPtrSetImpl &Visited) { + if (!Visited.insert(MA).second) { + return false; + } + if (MSSA.isLiveOnEntryDef(MA)) { + return true; + } + if (auto *Def = llvm::dyn_cast(MA)) { + // We only care about stores for now + if (const auto *St = + llvm::dyn_cast(Def->getMemoryInst())) { + ReachingDefs.insert(St); + return false; + } + return true; + } + if (auto *Phi = llvm::dyn_cast(MA)) { + for (const auto &Inc 
: Phi->incoming_values()) { + bool LOE = collectReachingDefs(llvm::cast(Inc.get()), + MSSA, ReachingDefs, Visited); + if (LOE) { + return true; + } + } + } + return false; +} + +bool psr::collectReachingDefs( + const llvm::LoadInst *Load, llvm::MemorySSA &MSSA, + llvm::SmallPtrSetImpl &ReachingDefs) { + if (auto *Access = MSSA.getMemoryAccess(Load)) { + auto *Clobber = MSSA.getWalker()->getClobberingMemoryAccess(Access); + llvm::SmallPtrSet Visited; + return collectReachingDefs(Clobber, MSSA, ReachingDefs, Visited); + } + + return true; +} diff --git a/lib/PhasarLLVM/Pointer/Pointer.cppm b/lib/PhasarLLVM/Pointer/Pointer.cppm index 92661ac141..4ee1f0d16e 100644 --- a/lib/PhasarLLVM/Pointer/Pointer.cppm +++ b/lib/PhasarLLVM/Pointer/Pointer.cppm @@ -1,33 +1,47 @@ module; -#include "phasar/Config/phasar-config.h" -#include "phasar/PhasarLLVM/Pointer/AliasAnalysisView.h" -#include "phasar/PhasarLLVM/Pointer/FilteredLLVMAliasSet.h" -#include "phasar/PhasarLLVM/Pointer/LLVMAliasInfo.h" -#include "phasar/PhasarLLVM/Pointer/LLVMAliasSet.h" -#include "phasar/PhasarLLVM/Pointer/LLVMAliasSetData.h" -#include "phasar/PhasarLLVM/Pointer/LLVMPointsToInfo.h" -#include "phasar/PhasarLLVM/Pointer/LLVMPointsToUtils.h" - -#ifdef PHASAR_USE_SVF -#include "phasar/PhasarLLVM/Pointer/SVF/SVFPointsToSet.h" -#endif +#include "phasar/PhasarLLVM/Pointer.h" export module phasar.llvm.pointer; export namespace psr { using psr::AliasAnalysisView; using psr::AliasInfoTraits; +using psr::AndersenOTFResult; +using psr::AndersenOTFSolver; +using psr::collectReachingDefs; +using psr::computeAndersenOTF; +using psr::computeAndersenOTFRaw; +using psr::computeBotCtxIndSensUnionFindAA; +using psr::computeBotCtxIndSensUnionFindAARaw; +using psr::computeBotCtxSensUnionFindAA; +using psr::computeBotCtxSensUnionFindAARaw; +using psr::computeCtxIndSensUnionFindAA; +using psr::computeCtxIndSensUnionFindAARaw; +using psr::computeCtxSensUnionFindAA; +using psr::computeCtxSensUnionFindAARaw; +using 
psr::computeIndSensUnionFindAA; +using psr::computeIndSensUnionFindAARaw; +using psr::computeUnionFindAA; +using psr::computeUnionFindAARaw; using psr::FilteredLLVMAliasSet; using psr::FunctionAliasView; +using psr::GlobalInitCache; using psr::isInterestingPointer; using psr::LLVMAliasInfo; using psr::LLVMAliasInfoRef; using psr::LLVMAliasIteratorRef; using psr::LLVMAliasSet; using psr::LLVMAliasSetData; +using psr::LLVMLocalUnionFindAliasIterator; +using psr::LLVMLocalUnionFindAliasIteratorMixin; using psr::LLVMPointsToIterator; using psr::LLVMPointsToIteratorRef; +using psr::llvmUnionFindAliasHandler; +using psr::LLVMUnionFindAliasIterator; +using psr::LLVMUnionFindAliasIteratorMixin; +using psr::MemSSABundle; +using psr::pag::LLVMCGProvider; #ifdef PHASAR_USE_SVF using psr::createLLVMSVFPointsToIterator; diff --git a/lib/PhasarLLVM/Utils/LLVMShorthands.cpp b/lib/PhasarLLVM/Utils/LLVMShorthands.cpp index 7b13dd2194..9f8f2bbca8 100644 --- a/lib/PhasarLLVM/Utils/LLVMShorthands.cpp +++ b/lib/PhasarLLVM/Utils/LLVMShorthands.cpp @@ -746,3 +746,34 @@ const llvm::DIType *psr::stripPointerTypes(const llvm::DIType *DITy) { } return DITy; } + +const llvm::Function *psr::walkConstInitPath(const llvm::Constant *Init, + llvm::ArrayRef Indices) { + if (Indices.empty()) { + return llvm::dyn_cast(Init->stripPointerCastsAndAliases()); + } + const uint64_t Idx0 = Indices[0]; + const llvm::Constant *Elem = nullptr; + if (const auto *CA = llvm::dyn_cast(Init)) { + if (Idx0 >= CA->getNumOperands()) { + return nullptr; + } + Elem = CA->getOperand(Idx0); + } else if (llvm::isa(Init)) { + if (Idx0 != 0) { + return nullptr; + } + Elem = Init; // struct: idx0 is pointer-arithmetic no-op, stay here + } else { + return nullptr; + } + for (const uint64_t Idx : Indices.drop_front(1)) { + const auto *Agg = llvm::dyn_cast(Elem); + if (!Agg || Idx >= Agg->getNumOperands()) { + return nullptr; + } + Elem = Agg->getOperand(Idx); + } + return llvm::dyn_cast_or_null( + 
Elem->stripPointerCastsAndAliases()); +} diff --git a/lib/PhasarLLVM/Utils/VirtualCallUtils.cpp b/lib/PhasarLLVM/Utils/VirtualCallUtils.cpp new file mode 100644 index 0000000000..87cd85b0dc --- /dev/null +++ b/lib/PhasarLLVM/Utils/VirtualCallUtils.cpp @@ -0,0 +1,135 @@ +#include "phasar/PhasarLLVM/Utils/VirtualCallUtils.h" + +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" + +using namespace psr; + +std::optional psr::getVFTIndex(const llvm::CallBase *CallSite) { + // deal with a virtual member function + // retrieve the vtable entry that is called + const auto *Load = + llvm::dyn_cast(CallSite->getCalledOperand()); + if (Load == nullptr) { + return std::nullopt; + } + const auto *GEP = + llvm::dyn_cast(Load->getPointerOperand()); + if (GEP == nullptr) { + return std::nullopt; + } + if (auto *CI = llvm::dyn_cast(GEP->getOperand(1))) { + return CI->getZExtValue(); + } + return std::nullopt; +} + +std::optional> +psr::getVFTIndexAndVT(const llvm::CallBase *CallSite) { + // deal with a virtual member function + // retrieve the vtable entry that is called + const auto *Load = + llvm::dyn_cast(CallSite->getCalledOperand()); + if (Load == nullptr) { + return std::nullopt; + } + + const auto *GEP = + llvm::dyn_cast(Load->getPointerOperand()); + // Vtable GEPs index into a pointer array with a single index. + // Multi-index GEPs (e.g. struct field access) are not vtable patterns. 
+ if (GEP == nullptr || GEP->getNumOperands() != 2) { + return std::nullopt; + } + + if (auto *CI = llvm::dyn_cast(GEP->getOperand(1))) { + return {{GEP->getPointerOperand(), CI->getZExtValue()}}; + } + + return std::nullopt; +} + +std::optional, + llvm::Type *>> +psr::getStructVCallInfo(const llvm::CallBase *CallSite) { + const auto *Load = + llvm::dyn_cast(CallSite->getCalledOperand()); + if (!Load) { + return std::nullopt; + } + const auto *GEP = + llvm::dyn_cast(Load->getPointerOperand()); + if (!GEP || GEP->getNumOperands() < 3 || !GEP->hasAllConstantIndices()) { + return std::nullopt; + } + llvm::SmallVector Indices; + for (const llvm::Use &Idx : GEP->indices()) { + Indices.push_back(llvm::cast(Idx.get())->getZExtValue()); + } + return {{GEP->getPointerOperand(), std::move(Indices), + GEP->getSourceElementType()}}; +} + +bool psr::isConsistentCall(const llvm::CallBase *CallSite, + const llvm::Function *DestFun) { + if (CallSite->arg_size() < DestFun->arg_size()) { + return false; + } + if (CallSite->arg_size() != DestFun->arg_size() && !DestFun->isVarArg()) { + return false; + } + + for (const auto &[Param, ArgOp] : + llvm::zip_first(DestFun->args(), CallSite->args())) { + + const auto *ParamTy = Param.getType(); + const auto *ArgTy = ArgOp->getType(); + + if (ParamTy == ArgTy) { + // Trivial equality + continue; + } + + if (ParamTy->getTypeID() != ArgTy->getTypeID()) { + // Trivial non-equality, e.g. PointerType and IntegerType + return false; + } + + if (ParamTy->isPointerTy()) { + if (Param.hasByValAttr() != + CallSite->isByValArgument(ArgOp.getOperandNo())) { + return false; + } + + const auto *ParamSRetTy = Param.getParamStructRetType(); + const auto *ArgSRetTy = + CallSite->getParamStructRetType(ArgOp.getOperandNo()); + if ((ParamSRetTy != nullptr) != (ArgSRetTy != nullptr)) { + return false; + } + + if (ParamSRetTy && ArgSRetTy) { + // TODO: For better precision, compare the sret types as well + // Trivial non-equality, e.g. 
PointerType and IntegerType + if (ParamSRetTy->getTypeID() != ArgSRetTy->getTypeID()) { + // Trivial non-equality, e.g. PointerType and IntegerType + return false; + } + } + } + + if (ParamTy->isStructTy()) { + // Copied comment from struct-case in isTypeMatchForFunctionArgument(): + // > Well, we could do sanity checks here, but if the analysed code is + // > insane we would miss callees, so we don't do that. + + continue; + } + + // Types are non-equal and we could not find a reason to treat the same + return false; + } + + return true; +} diff --git a/lib/Pointer/CMakeLists.txt b/lib/Pointer/CMakeLists.txt index 66b1d2710f..6be2f9d0c2 100644 --- a/lib/Pointer/CMakeLists.txt +++ b/lib/Pointer/CMakeLists.txt @@ -10,6 +10,9 @@ add_phasar_library(phasar_pointer LLVM_LINK_COMPONENTS Support + LINK_PUBLIC + roaring::roaring + MODULE_FILES PhasarPointer.cppm ) diff --git a/test/llvm_test_code/pointers/CMakeLists.txt b/test/llvm_test_code/pointers/CMakeLists.txt index 1bb0a06f3c..5d84133448 100644 --- a/test/llvm_test_code/pointers/CMakeLists.txt +++ b/test/llvm_test_code/pointers/CMakeLists.txt @@ -1,4 +1,13 @@ set(lca_files + andersen_otf_interproc.c + andersen_otf_fp.c + andersen_otf_global_init.c + andersen_otf_merge_load.c + andersen_otf_fp_already_processed.c + andersen_otf_extern_callback.c + andersen_otf_fp_struct_field.c + andersen_otf_vtable.cpp + andersen_otf_vtable2.cpp basic_01.c basic_02.c basic_03.c @@ -47,6 +56,10 @@ set(lca_files ) set(lca_files_mem2reg + andersen_otf_interproc.c + andersen_otf_fp.c + andersen_otf_libc.c + andersen_otf_struct_vtable.c basic_01.c basic_02.c basic_03.c diff --git a/test/llvm_test_code/pointers/andersen_otf_extern_callback.c b/test/llvm_test_code/pointers/andersen_otf_extern_callback.c new file mode 100644 index 0000000000..b6b08eb29c --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_extern_callback.c @@ -0,0 +1,15 @@ +// Soundness test: close_stdout is passed as a fn-ptr arg to the external +// 
register_callback (a declaration). At Soundy/Sound, the solver must +// treat close_stdout as a reachable entry point and analyse its body, +// discovering flush_impl as a callee. At Unsound neither should appear. +void flush_impl(void) {} + +void close_stdout(void) { flush_impl(); } + +// External: only a declaration, body not available in this module. +void register_callback(void (*f)(void)); + +int main(void) { + register_callback(close_stdout); + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_fp.c b/test/llvm_test_code/pointers/andersen_otf_fp.c new file mode 100644 index 0000000000..cb0155a614 --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_fp.c @@ -0,0 +1,14 @@ +// On-the-fly function-pointer resolution: +// id is stored into fp and then called indirectly. The OTF fixpoint must +// discover id as a callee and propagate the alias between its formal +// parameter and its return value. +static int *id(int *x) { return x; } + +int main() { + int a; + int *p = &a; + int *(*fp)(int *) = id; + int *q = fp(p); + (void)q; + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_fp_already_processed.c b/test/llvm_test_code/pointers/andersen_otf_fp_already_processed.c new file mode 100644 index 0000000000..ce08b548c8 --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_fp_already_processed.c @@ -0,0 +1,51 @@ +// Demonstrates Bug 2: outer fixpoint exits when FunctionWorklist is empty +// even though checkUnresolvedFPCalls just grew pts for a call site that was +// already examined earlier in the same pass. +// +// Processing order (LIFO FunctionWorklist; main pushes D, A, B): +// pop B → call2 (g_fp2()) deferred, pts={}. +// pop A → call1 (g_fp1(get_y)) deferred, pts={} (D not yet run). +// pop D → relay processed (g_fp2=get_x), g_fp1=relay set. +// After D, propagation: pts(g_fp2_load)={get_x}, pts(g_fp1_load)={relay}. 
+// +// checkUnresolvedFPCalls: [call2, call1] +// call2: pts(g_fp2_load)={get_x} → connects get_x. ret(B) gets x. +// call1: pts(g_fp1_load)={relay} → connects relay with arg get_y +// → relay already processed → propagate → g_fp2 gains get_y. +// FunctionWorklist still empty → outer loop exits. call2 re-check skipped. +// +// Expected (sound): ret(B) must alias both x and y. + +int x, y; + +static int *get_x(void) { return &x; } +static int *get_y(void) { return &y; } + +static int *(*g_fp2)(void); +static void (*g_fp1)(int *(*)(void)); + +static void relay(int *(*cb)(void)) { g_fp2 = cb; } + +// Processed first (B pushed last by main). +// g_fp2 is still unset, so call2 deferred with pts={}. +static int *B(void) { return g_fp2(); } + +// Processed second (A pushed second by main). +// g_fp1 is still unset (D not yet run), so call1 deferred with pts={}. +static void A(void) { g_fp1(get_y); } + +// Processed third (D pushed first by main). +// Ensures relay, get_x, get_y are all processed before checkUnresolved runs. +static void D(void) { + get_x(); + get_y(); + relay(get_x); + g_fp1 = relay; +} + +int main(void) { + D(); // pushed first → bottom of stack → processed third + A(); // pushed second → processed second + B(); // pushed third → top of stack → processed first + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_fp_struct_field.c b/test/llvm_test_code/pointers/andersen_otf_fp_struct_field.c new file mode 100644 index 0000000000..ab2414b2cf --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_fp_struct_field.c @@ -0,0 +1,22 @@ +// Test: function pointer stored in a struct field via an initializer function, +// then retrieved and called indirectly. Mirrors the obstack chunkfun pattern. +// Expected: the indirect call in do_call() must have target() as a callee. 
+ +struct Ctx { + void *(*fn)(void *); +}; + +static void *target(void *arg) { return arg; } + +static void init_ctx(struct Ctx *ctx, void *(*fn)(void *)) { ctx->fn = fn; } + +static void *do_call(struct Ctx *ctx, void *arg) { + return ctx->fn(arg); // indirect call via struct field +} + +int main(void) { + struct Ctx ctx; + init_ctx(&ctx, target); + do_call(&ctx, (void *)0); + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_global_init.c b/test/llvm_test_code/pointers/andersen_otf_global_init.c new file mode 100644 index 0000000000..3f4ddbe1d0 --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_global_init.c @@ -0,0 +1,9 @@ +// Global pointer @p is initialised to &@x. +// Loading from @p must yield a pointer that aliases @x (Bug 2 soundness). +int x = 0; +int *p = &x; + +int main() { + int *q = p; + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_interproc.c b/test/llvm_test_code/pointers/andersen_otf_interproc.c new file mode 100644 index 0000000000..b8643bc22a --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_interproc.c @@ -0,0 +1,12 @@ +// Direct interprocedural alias propagation: +// retptr returns its argument, so the formal param and the return value +// share the same points-to set. +static int *retptr(int *x) { return x; } + +int main() { + int a; + int *p = &a; + int *q = retptr(p); + (void)q; + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_libc.c b/test/llvm_test_code/pointers/andersen_otf_libc.c new file mode 100644 index 0000000000..1d7ab0440e --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_libc.c @@ -0,0 +1,12 @@ +// strcpy(dst, src) library summary: +// param 0 (dst) -> ReturnValue => ret aliases dst +// param 1 (src) -> Parameter{0} => *dst = src +// The return value of strcpy must alias buf (arg 0). 
+#include + +int main(void) { + char buf[64]; + char *p = strcpy(buf, "hello"); + (void)p; + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_merge_load.c b/test/llvm_test_code/pointers/andersen_otf_merge_load.c new file mode 100644 index 0000000000..5a12a1afc5 --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_merge_load.c @@ -0,0 +1,20 @@ +// h->f->h cycle; h returns *p (the load result). +// After both h(&px) and h(&py), h's return value must alias x and y. +static int *f(int **p); + +static int *h(int **p) { + f(p); + return *p; +} + +static int *f(int **p) { return h(p); } + +int main() { + int x = 0; + int y = 0; + int *px = &x; + int *py = &y; + h(&px); + h(&py); + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_struct_vtable.c b/test/llvm_test_code/pointers/andersen_otf_struct_vtable.c new file mode 100644 index 0000000000..b5c8d3beae --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_struct_vtable.c @@ -0,0 +1,20 @@ +// Test: hand-rolled C vtable via const struct global. +// ops->write(...) must resolve precisely to myWrite, not myRead. +// Field-insensitive analysis would add both; the struct-vtable path +// reads the initializer at the specific field index. + +static int myRead(void *ctx) { return 0; } +static int myWrite(void *ctx, int v) { return v; } + +struct Ops { + int (*read)(void *); + int (*write)(void *, int); +}; + +static const struct Ops myOps = {myRead, myWrite}; + +int dispatch(const struct Ops *ops, void *ctx, int v) { + return ops->write(ctx, v); +} + +int main(void) { return dispatch(&myOps, 0, 42); } diff --git a/test/llvm_test_code/pointers/andersen_otf_vtable.cpp b/test/llvm_test_code/pointers/andersen_otf_vtable.cpp new file mode 100644 index 0000000000..9fa9eca0f4 --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_vtable.cpp @@ -0,0 +1,16 @@ +// Virtual dispatch via a pointer forces the vtable lookup path. 
+// call_get's return value must alias @x (returned by A::get). +struct A { + virtual int *get(); +}; + +int x; +int *A::get() { return &x; } + +static int *call_get(A *a) { return a->get(); } + +int main() { + A a; + int *p = call_get(&a); + return 0; +} diff --git a/test/llvm_test_code/pointers/andersen_otf_vtable2.cpp b/test/llvm_test_code/pointers/andersen_otf_vtable2.cpp new file mode 100644 index 0000000000..4ea6f652eb --- /dev/null +++ b/test/llvm_test_code/pointers/andersen_otf_vtable2.cpp @@ -0,0 +1,22 @@ +// Two virtual methods in the same vtable. +// call_getX (slot 0) must alias @x; call_getY (slot 1) must alias @y. +// With imprecise (all-slots) vtable handling both rets would alias both +// globals; the slot-specific path must keep them separate. +struct B { + virtual int *getX(); + virtual int *getY(); +}; + +int x, y; +int *B::getX() { return &x; } +int *B::getY() { return &y; } + +static int *call_getX(B *b) { return b->getX(); } +static int *call_getY(B *b) { return b->getY(); } + +int main() { + B b; + int *px = call_getX(&b); + int *py = call_getY(&b); + return 0; +} diff --git a/unittests/PhasarLLVM/DataFlow/IfdsIde/Problems/IFDSConstAnalysisTest.cpp b/unittests/PhasarLLVM/DataFlow/IfdsIde/Problems/IFDSConstAnalysisTest.cpp index 68402e4275..6badb8e24d 100644 --- a/unittests/PhasarLLVM/DataFlow/IfdsIde/Problems/IFDSConstAnalysisTest.cpp +++ b/unittests/PhasarLLVM/DataFlow/IfdsIde/Problems/IFDSConstAnalysisTest.cpp @@ -8,7 +8,9 @@ #include "phasar/PhasarLLVM/Pointer/LLVMAliasSet.h" #include "phasar/PhasarLLVM/SimpleAnalysisConstructor.h" #include "phasar/PhasarLLVM/Utils/LLVMShorthands.h" +#include "phasar/Utils/DebugOutput.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Casting.h" @@ -71,7 +73,8 @@ class IFDSConstAnalysisTest : public ::testing::Test { } } - EXPECT_EQ(GroundTruth, AllMutableAllocas); + EXPECT_EQ(GroundTruth, AllMutableAllocas) + << " Expected " << 
PrettyPrinter{GroundTruth}; } void compareResults(const std::set &GroundTruth, diff --git a/unittests/PhasarLLVM/Pointer/AndersenOTFAATest.cpp b/unittests/PhasarLLVM/Pointer/AndersenOTFAATest.cpp new file mode 100644 index 0000000000..d941b3e4d3 --- /dev/null +++ b/unittests/PhasarLLVM/Pointer/AndersenOTFAATest.cpp @@ -0,0 +1,1121 @@ +#include "phasar/PhasarLLVM/Pointer/AndersenOTFAA.h" + +#include "phasar/PhasarLLVM/DB/LLVMProjectIRDB.h" +#include "phasar/PhasarLLVM/Pointer/LLVMPointerAssignmentGraph.h" +#include "phasar/Pointer/RawAliasSet.h" +#include "phasar/Pointer/UnionFindAA.h" +#include "phasar/Utils/DebugOutput.h" +#include "phasar/Utils/IotaIterator.h" +#include "phasar/Utils/ValueCompressor.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/raw_ostream.h" + +#include "SrcCodeLocationEntry.h" +#include "TestConfig.h" +#include "gtest/gtest.h" + +#include +#include +#include +#include + +namespace { +using namespace psr; +using namespace psr::unittest; + +static_assert(UnionFindAAResult); + +constexpr auto PathToLLFiles = PHASAR_BUILD_SUBFOLDER("pointers/"); + +using TSL = TestingSrcLocation; +using GTMap = std::map>; + +[[nodiscard]] ValueId asId(const ValueCompressor &Compressor, + const LLVMProjectIRDB &IRDB, TSL Var) { + const auto *LLVMVar = testingLocInIR(Var, IRDB); + auto MaybeId = Compressor.getOrNull(LLVMVar); + if (!MaybeId) { + ADD_FAILURE() << "Value not in VC: " << Var; + return ValueId{}; + } + return *MaybeId; +} + +[[nodiscard]] std::string +stringifyVal(const ValueCompressor &Compressor, ValueId VId) { + std::string Ret; + llvm::raw_string_ostream ROS(Ret); + ROS << "{ "; + llvm::interleaveComma(Compressor.id2vars(VId), ROS, + [&](PAGVariable Var) { ROS << to_string(Var); }); + ROS << " }"; + 
return Ret; +} + +void dumpAnalysisState(const ValueCompressor &Compressor, + const AndersenOTFResult &Results) { + llvm::errs() << "ValueCompressor: {\n"; + for (const auto &[VId, Values] : Compressor.id2vars().enumerate()) { + llvm::errs() << " #" << uint32_t(VId) << ":\n"; + for (const auto Val : Values) { + llvm::errs() << " " << to_string(Val) << '\n'; + } + } + llvm::errs() << "}\n"; + llvm::errs() << "AliasSets: {\n"; + for (const auto &[VId, Aliases] : Results.AliasSets.enumerate()) { + bool First = true; + for (const auto &Var : Compressor.id2vars(VId)) { + llvm::errs() << " " << to_string(Var); + + if (First) { + First = false; + } else { + llvm::errs() << " MUST ALIAS with " + << to_string(*Compressor.id2vars(VId).begin()) << '\n'; + continue; + } + + if (Aliases.empty()) { + llvm::errs() << " aliases: EMPTY\n"; + continue; + } + + llvm::errs() << " aliases: {\n"; + Aliases.foreach ([&](ValueId AId) { + llvm::errs() << " " << stringifyVal(Compressor, AId) << '\n'; + }); + llvm::errs() << " }\n"; + } + } + llvm::errs() << "}\n"; +} + +constexpr llvm::StringRef EntryNames[] = {"main"}; + +/// Exact bidirectional GT check. +/// +/// Soundness: every alias listed in the GT must appear in the computed set. +/// Precision: no computed alias that is named in the GT (the "domain") may +/// be absent from the expected set. Values not named in the GT are outside +/// the domain and are not subject to the precision check. 
+void doAnalysisAndCheckExact( + const llvm::Twine &IRFile, const GTMap &ExpectedResults, + bool DumpResults = false, + std::source_location Loc = std::source_location::current()) { + + auto IRDB = LLVMProjectIRDB::loadOrExit(PathToLLFiles + IRFile); + + llvm::SmallVector Entries; + for (llvm::StringRef Name : EntryNames) { + const auto *Func = IRDB.getFunctionDefinition(Name); + if (!Func) { + ADD_FAILURE_AT(Loc.file_name(), Loc.line()) + << "Entry function not found: " << Name.str(); + return; + } + Entries.push_back(Func); + } + + ValueCompressor Compressor; + AndersenOTFResult Results = computeAndersenOTFRaw(IRDB, Entries, &Compressor); + + // Build domain from all values explicitly named in the GT. + llvm::SmallDenseSet Domain; + for (const auto &[PtrVar, ExpectedAliasVars] : ExpectedResults) { + Domain.insert(asId(Compressor, IRDB, PtrVar)); + for (const auto &AliasVar : ExpectedAliasVars) { + Domain.insert(asId(Compressor, IRDB, AliasVar)); + } + } + + for (const auto &[PtrVar, ExpectedAliasVars] : ExpectedResults) { + const auto PtrId = asId(Compressor, IRDB, PtrVar); + const auto &Computed = Results.getRawAliasSet(PtrId); + + RawAliasSet Expected; + // llvm::errs() << "For PtrId: #" << uint32_t(PtrId) << ":\n"; + for (const auto &AliasVar : ExpectedAliasVars) { + auto AliasId = asId(Compressor, IRDB, AliasVar); + Expected.insert(AliasId); + // llvm::errs() << "> Insert #" << uint32_t(AliasId) + // << " into Expected due to " << AliasVar << '\n'; + } + + // Soundness. + Expected.foreach ([&](ValueId AliasId) { + if (!Computed.contains(AliasId)) { + ADD_FAILURE_AT(Loc.file_name(), Loc.line()) + << "Missing expected alias of " << PtrVar << "(#" << uint32_t(PtrId) + << "): #" << uint32_t(AliasId) << " as " + << stringifyVal(Compressor, AliasId); + } + }); + + // Precision (domain-restricted). 
+ Computed.foreach ([&](ValueId VId) { + if (!Domain.contains(VId) || Expected.contains(VId)) { + return; + } + ADD_FAILURE_AT(Loc.file_name(), Loc.line()) + << "Unexpected alias of " << PtrVar << ": " + << stringifyVal(Compressor, VId); + }); + } + + if (DumpResults || ::testing::Test::HasFailure()) { + dumpAnalysisState(Compressor, Results); + } +} + +// ---- Tests ---------------------------------------------------------------- + +TEST(AndersenOTFAATest, InterProcArgRetAlias) { + // retptr(x) returns x — formal parameter and return value must alias. + const GTMap ExpectedResults = { + {TSL(ArgInFun{.Idx = 0, .InFunction = "retptr"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "retptr"}), + TSL(RetVal{.InFunction = "retptr"})}}, + {TSL(RetVal{.InFunction = "retptr"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "retptr"}), + TSL(RetVal{.InFunction = "retptr"})}}, + }; + doAnalysisAndCheckExact("andersen_otf_interproc_c_m2r_dbg.ll", + ExpectedResults); +} + +TEST(AndersenOTFAATest, FuncPtrArgRetAlias) { + // id(x) returns x, called only via function pointer. + // OTF must discover id as a callee and propagate arg/ret alias. + const GTMap ExpectedResults = { + {TSL(ArgInFun{.Idx = 0, .InFunction = "id"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id"}), + TSL(RetVal{.InFunction = "id"})}}, + {TSL(RetVal{.InFunction = "id"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id"}), + TSL(RetVal{.InFunction = "id"})}}, + }; + doAnalysisAndCheckExact("andersen_otf_fp_c_m2r_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, FuncByNameInVC) { + // The function 'id' has its address stored into fp; it must appear in VC. 
+ auto IRDB = LLVMProjectIRDB::loadOrExit( + PathToLLFiles + llvm::Twine("andersen_otf_fp_c_m2r_dbg.ll")); + + const auto *MainFn = IRDB.getFunctionDefinition("main"); + ASSERT_NE(MainFn, nullptr); + + ValueCompressor Compressor; + [[maybe_unused]] auto Results = + computeAndersenOTFRaw(IRDB, {MainFn}, &Compressor); + + const auto *IdFn = IRDB.getFunctionDefinition("id"); + ASSERT_NE(IdFn, nullptr); + auto MaybeId = Compressor.getOrNull(IdFn); + EXPECT_TRUE(MaybeId.has_value()) + << "Function 'id' not in VC — address-taken functions must be inserted"; +} + +TEST(AndersenOTFAATest, ContextInsensitiveCallsMerge) { + // context_01: id(&x) and id(&y) called from two call sites. + // Context-insensitive: both call-site return values alias the same node + // (pts merges both args). A context-sensitive analysis would keep them + // separate; this test verifies the expected context-insensitive behaviour. + const TSL Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id"}); + const TSL Ret = TSL(RetVal{.InFunction = "id"}); + // Call instructions for id(&x) and id(&y) in main (lines 8 and 9). + const TSL Call1 = TSL(LineColFunOp{.Line = 8, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL Call2 = TSL(LineColFunOp{.Line = 9, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const GTMap ExpectedResults = { + {Arg, {Arg, Ret, Call1, Call2}}, + {Ret, {Arg, Ret, Call1, Call2}}, + {Call1, {Arg, Ret, Call1, Call2}}, + {Call2, {Arg, Ret, Call1, Call2}}, + }; + doAnalysisAndCheckExact("context_01_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, SeparateFunctionsDontAlias) { + // context_02: id1 and id2 are independent identity functions called with + // different arguments. Their parameter and return-value nodes must not + // alias each other (precision check for context-insensitive analysis). 
+ const TSL Id1Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}); + const TSL Id1Ret = TSL(RetVal{.InFunction = "id1"}); + const TSL Id2Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}); + const TSL Id2Ret = TSL(RetVal{.InFunction = "id2"}); + const GTMap ExpectedResults = { + {Id1Arg, {Id1Arg, Id1Ret}}, + {Id1Ret, {Id1Arg, Id1Ret}}, + {Id2Arg, {Id2Arg, Id2Ret}}, + {Id2Ret, {Id2Arg, Id2Ret}}, + }; + doAnalysisAndCheckExact("context_02_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, TransitiveCallChain) { + // context_03: id2(q) = id1(q). Alias must propagate through the chain: + // id2_arg → id1_arg → id1_ret → id2_ret. All four must alias. + const GTMap ExpectedResults = { + {TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}), + TSL(RetVal{.InFunction = "id1"}), + TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}), + TSL(RetVal{.InFunction = "id2"})}}, + {TSL(RetVal{.InFunction = "id1"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}), + TSL(RetVal{.InFunction = "id1"}), + TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}), + TSL(RetVal{.InFunction = "id2"})}}, + {TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}), + TSL(RetVal{.InFunction = "id1"}), + TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}), + TSL(RetVal{.InFunction = "id2"})}}, + {TSL(RetVal{.InFunction = "id2"}), + {TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}), + TSL(RetVal{.InFunction = "id1"}), + TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}), + TSL(RetVal{.InFunction = "id2"})}}, + }; + doAnalysisAndCheckExact("context_03_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, DeepChainTwoObjectsMerge) { + // context_04_1: three-level identity chain (id3→id2→id1) called with both + // &x and &y. Context-insensitive: all params and rets of id1/id2/id3 and + // all four call sites alias each other AND with x/y (they share x_obj or + // y_obj as common pointee). x and y themselves do NOT alias each other. 
+ const TSL Id1Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id1"}); + const TSL Id2Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id2"}); + const TSL Id3Arg = TSL(ArgInFun{.Idx = 0, .InFunction = "id3"}); + const TSL Id1Ret = TSL(RetVal{.InFunction = "id1"}); + const TSL Id2Ret = TSL(RetVal{.InFunction = "id2"}); + const TSL Id3Ret = TSL(RetVal{.InFunction = "id3"}); + const TSL XX1 = TSL(LineColFunOp{.Line = 10, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL XX2 = TSL(LineColFunOp{.Line = 11, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL YY1 = TSL(LineColFunOp{.Line = 12, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL YY2 = TSL(LineColFunOp{.Line = 13, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + // %x / %y: the alloca pointers passed to id3; recovered as arg 0 of + // respective call sites (operand 0 of a CallInst = first argument). + const TSL XAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 10, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL YAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 12, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const std::vector Chain = {Id1Arg, Id2Arg, Id3Arg, Id1Ret, Id2Ret, + Id3Ret, XX1, XX2, YY1, YY2}; + // Chain members alias each other and both allocas (share x_obj or y_obj). + std::vector ChainWithBoth = Chain; + ChainWithBoth.push_back(XAlloca); + ChainWithBoth.push_back(YAlloca); + GTMap ExpectedResults; + for (const auto &ChainV : Chain) { + ExpectedResults[ChainV] = ChainWithBoth; + } + // x alloca aliases the chain (via x_obj) but NOT y. + std::vector XAliases = Chain; + XAliases.push_back(XAlloca); + ExpectedResults[XAlloca] = XAliases; + // y alloca aliases the chain (via y_obj) but NOT x. 
+ std::vector YAliases = Chain; + YAliases.push_back(YAlloca); + ExpectedResults[YAlloca] = YAliases; + + // llvm::errs() << "ExpectedResults[XAlloca]: " + // << PrettyPrinter{ExpectedResults[XAlloca]} << '\n'; + // llvm::errs() << "ExpectedResults[YAlloca]: " + // << PrettyPrinter{ExpectedResults[YAlloca]} << '\n'; + + doAnalysisAndCheckExact("context_04_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, RecursiveSelfAlias) { + // context_08: selfRecursion(Ptr) calls itself with Ptr, forming a cycle in + // the constraint graph. SCC collapsing must merge the recursive call result + // with the formal parameter and the two call-site results in main. + const TSL Ptr = TSL(ArgInFun{.Idx = 0, .InFunction = "selfRecursion"}); + const TSL Ret = TSL(RetVal{.InFunction = "selfRecursion"}); + // int *x = selfRecursion(kptr) at line 15 + const TSL X = TSL(LineColFunOp{.Line = 15, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + // int *y = selfRecursion(kptr) at line 19 + const TSL Y = TSL(LineColFunOp{.Line = 19, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const std::vector All = {Ptr, Ret, X, Y}; + GTMap ExpectedResults; + for (const auto &V : All) { + ExpectedResults[V] = All; + } + doAnalysisAndCheckExact("context_08_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, MutualRecursionAlias) { + // context_10_0: Forth and Back call each other with the same pointer; both + // called from main with &k. The mutual recursion forces all four + // param/ret nodes and the two call-site results to alias. 
+ const TSL ForthPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Forth"}); + const TSL BackPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Back"}); + const TSL ForthRet = TSL(RetVal{.InFunction = "Forth"}); + const TSL BackRet = TSL(RetVal{.InFunction = "Back"}); + // int *x = Back(&k) at line 26 + const TSL X = TSL(LineColFunOp{.Line = 26, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + // int *y = Back(&k) at line 30 + const TSL Y = TSL(LineColFunOp{.Line = 30, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const std::vector All = {ForthPtr, BackPtr, ForthRet, BackRet, X, Y}; + GTMap ExpectedResults; + for (const auto &V : All) { + ExpectedResults[V] = All; + } + doAnalysisAndCheckExact("context_10_0_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, ReturnSecondArgContextInsensitive) { + // context_12_1: argretq(p,q) returns q. Two call sites swap which + // argument is &x and which is &y. Context-insensitive: p, q, and the + // return value all receive both &x and &y, so they all alias each other. + const TSL P = TSL(ArgInFun{.Idx = 0, .InFunction = "argretq"}); + const TSL Q = TSL(ArgInFun{.Idx = 1, .InFunction = "argretq"}); + const TSL Ret = TSL(RetVal{.InFunction = "argretq"}); + // int *xx1 = argretq(&y, &x) at line 8 + const TSL XX1 = TSL(LineColFunOp{.Line = 8, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + // int *yy1 = argretq(&x, &y) at line 9 + const TSL YY1 = TSL(LineColFunOp{.Line = 9, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const std::vector All = {P, Q, Ret, XX1, YY1}; + GTMap ExpectedResults; + for (const auto &V : All) { + ExpectedResults[V] = All; + } + doAnalysisAndCheckExact("context_12_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, FuncPtrCallbackIdentity) { + // context_14_1: callback(Func) returns Func — identity on function pointers. + // Two call sites pass &ret0 and &ret1 respectively. 
OTF must discover + // both callees. The formal parameter and return value of callback must + // alias (they point to the same set of function objects). + const TSL Func = TSL(ArgInFun{.Idx = 0, .InFunction = "callback"}); + const TSL Ret = TSL(RetVal{.InFunction = "callback"}); + const GTMap ExpectedResults = { + {Func, {Func, Ret}}, + {Ret, {Func, Ret}}, + }; + doAnalysisAndCheckExact("context_14_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, RecursionTwoObjectsMerge) { + // context_09_0: selfRecursion called with &k and &l. + // Context-insensitive: Ptr receives both; all four alias. + // k and l alias the chain (via their objects) but not each other. + const TSL Ptr = TSL(ArgInFun{.Idx = 0, .InFunction = "selfRecursion"}); + const TSL Ret = TSL(RetVal{.InFunction = "selfRecursion"}); + const TSL CallX = TSL(LineColFunOp{.Line = 15, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL CallY = TSL(LineColFunOp{.Line = 16, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL KAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 15, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL LAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 16, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const std::vector Chain = {Ptr, Ret, CallX, CallY}; + GTMap ExpectedResults; + std::vector ChainAndBoth = Chain; + ChainAndBoth.push_back(KAlloca); + ChainAndBoth.push_back(LAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + std::vector KAliases = Chain; + KAliases.push_back(KAlloca); + ExpectedResults[KAlloca] = KAliases; + std::vector LAliases = Chain; + LAliases.push_back(LAlloca); + ExpectedResults[LAlloca] = LAliases; + doAnalysisAndCheckExact("context_09_0_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, MutualRecursionTwoObjects) { + // 
context_10_1: Forth↔Back mutual recursion, called with &k and &l. + // All four params/rets and four call-site results alias. + // k and l each alias all eight but not each other. + const TSL ForthPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Forth"}); + const TSL BackPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Back"}); + const TSL ForthRet = TSL(RetVal{.InFunction = "Forth"}); + const TSL BackRet = TSL(RetVal{.InFunction = "Back"}); + // xx1=Back(&k) line 27, xx2=Back(&k) line 29, yy1=Back(&l) line 31, + // yy2=Back(&l) line 33 + const auto MkCall = [](uint32_t Line) { + return TSL(LineColFunOp{.Line = Line, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + }; + const TSL XX1 = MkCall(27); + const TSL XX2 = MkCall(29); + const TSL YY1 = MkCall(31); + const TSL YY2 = MkCall(33); + const TSL KAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 27, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL LAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 31, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const std::vector Chain = {ForthPtr, BackPtr, ForthRet, BackRet, + XX1, XX2, YY1, YY2}; + GTMap ExpectedResults; + std::vector ChainAndBoth = Chain; + ChainAndBoth.push_back(KAlloca); + ChainAndBoth.push_back(LAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + std::vector KAliases = Chain; + KAliases.push_back(KAlloca); + ExpectedResults[KAlloca] = KAliases; + std::vector LAliases = Chain; + LAliases.push_back(LAlloca); + ExpectedResults[LAlloca] = LAliases; + doAnalysisAndCheckExact("context_10_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, ThreeWayMutualRecursion) { + // context_11_0: Forth↔Back↔Stop three-way mutual recursion. + // All six params/rets and both call-site results alias. 
+ const TSL ForthPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Forth"}); + const TSL BackPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Back"}); + const TSL StopPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Stop"}); + const TSL ForthRet = TSL(RetVal{.InFunction = "Forth"}); + const TSL BackRet = TSL(RetVal{.InFunction = "Back"}); + const TSL StopRet = TSL(RetVal{.InFunction = "Stop"}); + // x=Back(&k) line 36, y=Forth(&l) line 37 + const TSL CallX = TSL(LineColFunOp{.Line = 36, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL CallY = TSL(LineColFunOp{.Line = 37, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL KAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 36, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL LAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 37, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const std::vector Chain = {ForthPtr, BackPtr, StopPtr, ForthRet, + BackRet, StopRet, CallX, CallY}; + GTMap ExpectedResults; + std::vector ChainAndBoth = Chain; + ChainAndBoth.push_back(KAlloca); + ChainAndBoth.push_back(LAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + std::vector KAliases = Chain; + KAliases.push_back(KAlloca); + ExpectedResults[KAlloca] = KAliases; + std::vector LAliases = Chain; + LAliases.push_back(LAlloca); + ExpectedResults[LAlloca] = LAliases; + doAnalysisAndCheckExact("context_11_0_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, ThreeArgReturnQContextInsensitive) { + // context_13_1: argretq(p,q,r) returns q. Two call sites pass all-x and + // all-y. Context-insensitive: all three params and the return merge. + // x and y allocas alias the group but not each other. 
+ const TSL ArgP = TSL(ArgInFun{.Idx = 0, .InFunction = "argretq"}); + const TSL ArgQ = TSL(ArgInFun{.Idx = 1, .InFunction = "argretq"}); + const TSL ArgR = TSL(ArgInFun{.Idx = 2, .InFunction = "argretq"}); + const TSL Ret = TSL(RetVal{.InFunction = "argretq"}); + // xx1=argretq(&x,&x,&x) line 8, yy1=argretq(&y,&y,&y) line 9 + const TSL XX1 = TSL(LineColFunOp{.Line = 8, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL YY1 = TSL(LineColFunOp{.Line = 9, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + const TSL XAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 8, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL YAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 9, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const std::vector Chain = {ArgP, ArgQ, ArgR, Ret, XX1, YY1}; + GTMap ExpectedResults; + std::vector ChainAndBoth = Chain; + ChainAndBoth.push_back(XAlloca); + ChainAndBoth.push_back(YAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + std::vector XAliases = Chain; + XAliases.push_back(XAlloca); + ExpectedResults[XAlloca] = XAliases; + std::vector YAliases = Chain; + YAliases.push_back(YAlloca); + ExpectedResults[YAlloca] = YAliases; + doAnalysisAndCheckExact("context_13_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, FuncPtrCallbackThreeWayMerge) { + // context_14_2: callback(Func) returns Func, called with &ret0, &ret1, + // &ret2. Func and Ret alias all three function values. The individual + // function values alias Func and Ret but NOT each other (disjoint pts sets). 
+ const TSL Func = TSL(ArgInFun{.Idx = 0, .InFunction = "callback"}); + const TSL Ret = TSL(RetVal{.InFunction = "callback"}); + const TSL Ret0 = TSL(FuncByName{.FuncName = "ret0"}); + const TSL Ret1 = TSL(FuncByName{.FuncName = "ret1"}); + const TSL Ret2 = TSL(FuncByName{.FuncName = "ret2"}); + const GTMap ExpectedResults = { + {Func, {Func, Ret, Ret0, Ret1, Ret2}}, + {Ret, {Func, Ret, Ret0, Ret1, Ret2}}, + {Ret0, {Ret0, Func, Ret}}, + {Ret1, {Ret1, Func, Ret}}, + {Ret2, {Ret2, Func, Ret}}, + }; + doAnalysisAndCheckExact("context_14_2_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, FourLevelChainTwoObjects) { + // context_05_1: 4-level identity chain (id4→id3→id2→id1), called 4 times + // with &x and &y. All params/rets and call sites merge + // (context-insensitive). x and y allocas alias the chain but not each other. + const auto MkArg = [](llvm::StringRef Fn) { + return TSL(ArgInFun{.Idx = 0, .InFunction = Fn}); + }; + const auto MkRet = [](llvm::StringRef Fn) { + return TSL(RetVal{.InFunction = Fn}); + }; + const auto MkCall = [](uint32_t Line) { + return TSL(LineColFunOp{.Line = Line, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + }; + const std::vector Chain = { + MkArg("id1"), MkArg("id2"), MkArg("id3"), MkArg("id4"), + MkRet("id1"), MkRet("id2"), MkRet("id3"), MkRet("id4"), + MkCall(11), MkCall(12), MkCall(13), MkCall(14), + }; + // arg 0 of call at line 11 is &x; arg 0 of call at line 13 is &y. 
+ const TSL XAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 11, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL YAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 13, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + GTMap ExpectedResults; + auto ChainAndBoth = Chain; + ChainAndBoth.push_back(XAlloca); + ChainAndBoth.push_back(YAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + auto XAliases = Chain; + XAliases.push_back(XAlloca); + ExpectedResults[XAlloca] = XAliases; + auto YAliases = Chain; + YAliases.push_back(YAlloca); + ExpectedResults[YAlloca] = YAliases; + doAnalysisAndCheckExact("context_05_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, FourLevelChainVariantTwoObjects) { + // context_07: foo→bar→baz→buzz 4-level identity chain, called with &x and + // &y. All params/rets and both call sites alias; x and y don't alias. 
+ const auto MkArg = [](llvm::StringRef Fn) { + return TSL(ArgInFun{.Idx = 0, .InFunction = Fn}); + }; + const auto MkRet = [](llvm::StringRef Fn) { + return TSL(RetVal{.InFunction = Fn}); + }; + const auto MkCall = [](uint32_t Line) { + return TSL(LineColFunOp{.Line = Line, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + }; + const std::vector Chain = { + MkArg("buzz"), MkArg("baz"), MkArg("bar"), MkArg("foo"), MkRet("buzz"), + MkRet("baz"), MkRet("bar"), MkRet("foo"), MkCall(11), MkCall(12), + }; + const TSL XAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 11, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL YAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 12, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + GTMap ExpectedResults; + auto ChainAndBoth = Chain; + ChainAndBoth.push_back(XAlloca); + ChainAndBoth.push_back(YAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + auto XAliases = Chain; + XAliases.push_back(XAlloca); + ExpectedResults[XAlloca] = XAliases; + auto YAliases = Chain; + YAliases.push_back(YAlloca); + ExpectedResults[YAlloca] = YAliases; + doAnalysisAndCheckExact("context_07_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, RecursionFourCallSites) { + // context_09_1: selfRecursion called with &k (twice) and &l (twice). + // Context-insensitive: Ptr and Ret alias all 4 call sites. + // k and l each alias the chain but not each other. 
+ const TSL Ptr = TSL(ArgInFun{.Idx = 0, .InFunction = "selfRecursion"}); + const TSL Ret = TSL(RetVal{.InFunction = "selfRecursion"}); + const auto MkCall = [](uint32_t Line) { + return TSL(LineColFunOp{.Line = Line, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + }; + const std::vector Chain = {Ptr, Ret, MkCall(15), + MkCall(17), MkCall(18), MkCall(20)}; + const TSL KAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 15, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL LAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 18, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + GTMap ExpectedResults; + auto ChainAndBoth = Chain; + ChainAndBoth.push_back(KAlloca); + ChainAndBoth.push_back(LAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + auto KAliases = Chain; + KAliases.push_back(KAlloca); + ExpectedResults[KAlloca] = KAliases; + auto LAliases = Chain; + LAliases.push_back(LAlloca); + ExpectedResults[LAlloca] = LAliases; + doAnalysisAndCheckExact("context_09_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, ThreeWayMutualRecursionFourCallSites) { + // context_11_1: Forth↔Back↔Stop three-way mutual recursion, called with &k + // (twice) and &l (twice). All six params/rets and all four call sites alias. + // k and l each alias the chain but not each other. 
+ const TSL ForthPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Forth"}); + const TSL BackPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Back"}); + const TSL StopPtr = TSL(ArgInFun{.Idx = 0, .InFunction = "Stop"}); + const TSL ForthRet = TSL(RetVal{.InFunction = "Forth"}); + const TSL BackRet = TSL(RetVal{.InFunction = "Back"}); + const TSL StopRet = TSL(RetVal{.InFunction = "Stop"}); + const auto MkCall = [](uint32_t Line) { + return TSL(LineColFunOp{.Line = Line, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}); + }; + const std::vector Chain = {ForthPtr, BackPtr, StopPtr, ForthRet, + BackRet, StopRet, MkCall(36), MkCall(37), + MkCall(38), MkCall(39)}; + const TSL KAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 36, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + const TSL LAlloca = + TSL(OperandOf{.OperandIndex = 0, + .Inst = LineColFunOp{.Line = 38, + .Col = 0, + .InFunction = "main", + .OpCode = llvm::Instruction::Call}}); + GTMap ExpectedResults; + auto ChainAndBoth = Chain; + ChainAndBoth.push_back(KAlloca); + ChainAndBoth.push_back(LAlloca); + for (const auto &Item : Chain) { + ExpectedResults[Item] = ChainAndBoth; + } + auto KAliases = Chain; + KAliases.push_back(KAlloca); + ExpectedResults[KAlloca] = KAliases; + auto LAliases = Chain; + LAliases.push_back(LAlloca); + ExpectedResults[LAlloca] = LAliases; + doAnalysisAndCheckExact("context_11_1_c_dbg.ll", ExpectedResults); +} + +TEST(AndersenOTFAATest, TwoArgSecondRetFourCallSites) { + // context_12_0: argretq(p,q) returns q. Four call sites mix &x and &y: + // argretq(&y,&x) twice and argretq(&x,&y) twice. + // Context-insensitive: p and q both receive {&x,&y}; all alias. + // x and y allocas each alias the group but not each other. 
+  const TSL P = TSL(ArgInFun{.Idx = 0, .InFunction = "argretq"});
+  const TSL Q = TSL(ArgInFun{.Idx = 1, .InFunction = "argretq"});
+  const TSL Ret = TSL(RetVal{.InFunction = "argretq"});
+  const auto MkCall = [](uint32_t Line) {
+    return TSL(LineColFunOp{.Line = Line,
+                            .Col = 0,
+                            .InFunction = "main",
+                            .OpCode = llvm::Instruction::Call});
+  };
+  const std::vector Chain = {P, Q, Ret, MkCall(8),
+                             MkCall(9), MkCall(10), MkCall(11)};
+  // arg 1 of call at line 8 is &x (argretq(&y, &x)); arg 0 is &y.
+  const TSL XAlloca =
+      TSL(OperandOf{.OperandIndex = 1,
+                    .Inst = LineColFunOp{.Line = 8,
+                                         .Col = 0,
+                                         .InFunction = "main",
+                                         .OpCode = llvm::Instruction::Call}});
+  const TSL YAlloca =
+      TSL(OperandOf{.OperandIndex = 0,
+                    .Inst = LineColFunOp{.Line = 8,
+                                         .Col = 0,
+                                         .InFunction = "main",
+                                         .OpCode = llvm::Instruction::Call}});
+  GTMap ExpectedResults;
+  auto ChainAndBoth = Chain;
+  ChainAndBoth.push_back(XAlloca);
+  ChainAndBoth.push_back(YAlloca);
+  for (const auto &Item : Chain) {
+    ExpectedResults[Item] = ChainAndBoth;
+  }
+  auto XAliases = Chain;
+  XAliases.push_back(XAlloca);
+  ExpectedResults[XAlloca] = XAliases;
+  auto YAliases = Chain;
+  YAliases.push_back(YAlloca);
+  ExpectedResults[YAlloca] = YAliases;
+  doAnalysisAndCheckExact("context_12_0_c_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, VTableDispatch) {
+  // Virtual call via A* in call_get must resolve through the vtable.
+  // A::get() returns @x, so call_get's return must alias @x.
+  const TSL CallGetRet = TSL(RetVal{.InFunction = "_ZL8call_getP1A"});
+  const TSL X = TSL(GlobalVar{.Name = "x"});
+  const GTMap ExpectedResults = {
+      {CallGetRet, {CallGetRet, X}},
+      {X, {X, CallGetRet}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_vtable_cpp_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, GlobalPtrInitializer) {
+  // @p = global ptr @x; loading from @p must alias @x (Bug 2 soundness).
+  const TSL LoadQ = TSL(LineColFunOp{.Line = 7,
+                                     .Col = 12,
+                                     .InFunction = "main",
+                                     .OpCode = llvm::Instruction::Load});
+  const TSL X = TSL(GlobalVar{.Name = "x"});
+  const GTMap ExpectedResults = {
+      {LoadQ, {LoadQ, X}},
+      {X, {X, LoadQ}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_global_init_c_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, MergeLoadConstraint) {
+  // h->f->h cycle; h returns *p.
+  // ret(h) must alias x and y after h(&px) and h(&py) (Bug 1 soundness).
+  const TSL RetH = TSL(RetVal{.InFunction = "h"});
+  // Operand 1 (pointer) of "int x = 0" / "int y = 0" stores — stable across
+  // LLVM versions (unlike the px/py initialization stores whose debug
+  // location moved from first-use to declaration site between LLVM 16 and 22).
+  const TSL VarX =
+      TSL(OperandOf{.OperandIndex = 1,
+                    .Inst = LineColFunOp{.Line = 13,
+                                         .Col = 7,
+                                         .InFunction = "main",
+                                         .OpCode = llvm::Instruction::Store}});
+  const TSL VarY =
+      TSL(OperandOf{.OperandIndex = 1,
+                    .Inst = LineColFunOp{.Line = 14,
+                                         .Col = 7,
+                                         .InFunction = "main",
+                                         .OpCode = llvm::Instruction::Store}});
+  const GTMap ExpectedResults = {
+      {RetH, {RetH, VarX, VarY}},
+      {VarX, {RetH, VarX}},
+      {VarY, {RetH, VarY}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_merge_load_c_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, AlreadyProcessedCalleePropagation) {
+  // andersen_otf_fp_already_processed: main pushes D, A, B → LIFO processes
+  // B first (call2 deferred, pts={}), A second (call1 deferred, pts={}),
+  // D third (relay/get_x/get_y processed, g_fp1=relay, g_fp2=get_x set).
+  // checkUnresolvedFPCalls: call2 sees pts={get_x}, call1 connects already-
+  // processed relay with get_y → g_fp2 gains get_y — but call2 already ran.
+  // The outer loop must re-check so ret(B) aliases both &x and &y.
+  const TSL RetB = TSL(RetVal{.InFunction = "B"});
+  const TSL X = TSL(GlobalVar{.Name = "x"});
+  const TSL Y = TSL(GlobalVar{.Name = "y"});
+  const GTMap ExpectedResults = {
+      {RetB, {RetB, X, Y}},
+      {X, {X, RetB}},
+      {Y, {Y, RetB}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_fp_already_processed_c_dbg.ll",
+                          ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, VTableDispatchPrecision) {
+  // B has two virtual methods: getX (slot 0) returns @x, getY (slot 1)
+  // returns @y. Per-slot dispatch must keep the two return values separate.
+  const TSL RetGetX = TSL(RetVal{.InFunction = "_ZL9call_getXP1B"});
+  const TSL RetGetY = TSL(RetVal{.InFunction = "_ZL9call_getYP1B"});
+  const TSL X = TSL(GlobalVar{.Name = "x"});
+  const TSL Y = TSL(GlobalVar{.Name = "y"});
+  const GTMap ExpectedResults = {
+      {RetGetX, {RetGetX, X}},
+      {X, {X, RetGetX}},
+      {RetGetY, {RetGetY, Y}},
+      {Y, {Y, RetGetY}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_vtable2_cpp_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, SoundnessFnPtrToExternalDecl) {
+  // andersen_otf_extern_callback: main passes @close_stdout to the
+  // declaration-only register_callback. close_stdout calls flush_impl.
+  //
+  // Soundy: both must appear as CG vertices (entry-point promotion).
+  // Unsound: neither must appear (no processing of external callbacks).
+  auto IRDB = LLVMProjectIRDB::loadOrExit(
+      PathToLLFiles + "andersen_otf_extern_callback_c_dbg.ll");
+
+  const auto *CloseStdout = IRDB.getFunctionDefinition("close_stdout");
+  const auto *FlushImpl = IRDB.getFunctionDefinition("flush_impl");
+  const auto *MainFn = IRDB.getFunctionDefinition("main");
+  ASSERT_NE(MainFn, nullptr);
+  ASSERT_NE(CloseStdout, nullptr);
+  ASSERT_NE(FlushImpl, nullptr);
+
+  // True iff Fun is among the functions listed by getAllVertexFunctions().
+  auto HasCGVertex = [](const LLVMBasedCallGraph &Graph,
+                        const llvm::Function *Fun) {
+    return llvm::is_contained(Graph.getAllVertexFunctions(), Fun);
+  };
+
+  {
+    auto Res =
+        computeAndersenOTFRaw(IRDB, {MainFn}, nullptr, Soundness::Soundy);
+    EXPECT_TRUE(HasCGVertex(Res.CG, CloseStdout))
+        << "close_stdout must be a CG vertex at Soundy";
+    EXPECT_TRUE(HasCGVertex(Res.CG, FlushImpl))
+        << "flush_impl must be a CG vertex at Soundy";
+  }
+
+  {
+    auto Res =
+        computeAndersenOTFRaw(IRDB, {MainFn}, nullptr, Soundness::Unsound);
+    EXPECT_FALSE(HasCGVertex(Res.CG, CloseStdout))
+        << "close_stdout must not be a CG vertex at Unsound";
+    EXPECT_FALSE(HasCGVertex(Res.CG, FlushImpl))
+        << "flush_impl must not be a CG vertex at Unsound";
+  }
+}
+
+TEST(AndersenOTFAATest, LibCSummaryStrcpyReturnAliasesDst) {
+  // strcpy(buf, "hello") summary: param 0 (dst) -> ReturnValue.
+  // The call result must alias buf (arg 0); they share the same buffer object.
+  // This exercises the ReturnValue branch of applyLibrarySummary().
+  const TSL Call = TSL(LineColFunOp{.Line = 9,
+                                    .Col = 0,
+                                    .InFunction = "main",
+                                    .OpCode = llvm::Instruction::Call});
+  const TSL Buf =
+      TSL(OperandOf{.OperandIndex = 0,
+                    .Inst = LineColFunOp{.Line = 9,
+                                         .Col = 0,
+                                         .InFunction = "main",
+                                         .OpCode = llvm::Instruction::Call}});
+  const GTMap ExpectedResults = {
+      {Call, {Call, Buf}},
+      {Buf, {Buf, Call}},
+  };
+  doAnalysisAndCheckExact("andersen_otf_libc_c_m2r_dbg.ll", ExpectedResults);
+}
+
+TEST(AndersenOTFAATest, FnPtrStoredInStructField) {
+  // Function pointer stored into a struct field by an initializer, then
+  // retrieved and called indirectly. The indirect call in do_call() must
+  // have target() as a callee.
+  auto IRDB = LLVMProjectIRDB::loadOrExit(
+      PathToLLFiles + "andersen_otf_fp_struct_field_c_dbg.ll");
+
+  const auto *DoCall = IRDB.getFunctionDefinition("do_call");
+  const auto *Target = IRDB.getFunctionDefinition("target");
+  const auto *MainFn = IRDB.getFunctionDefinition("main");
+  ASSERT_NE(MainFn, nullptr);
+  ASSERT_NE(DoCall, nullptr);
+  ASSERT_NE(Target, nullptr);
+
+  auto Res = computeAndersenOTFRaw(IRDB, {MainFn});
+
+  // Find the indirect call instruction in do_call.
+  const llvm::CallBase *IndirectCS = nullptr;
+  for (const auto &I : llvm::instructions(DoCall)) {
+    const auto *CS = llvm::dyn_cast<llvm::CallBase>(&I);
+    if (!CS || CS->isDebugOrPseudoInst()) {
+      continue;
+    }
+    if (!llvm::isa<llvm::Function>(
+            CS->getCalledOperand()->stripPointerCastsAndAliases())) {
+      IndirectCS = CS;
+      break;
+    }
+  }
+  ASSERT_NE(IndirectCS, nullptr) << "No indirect call found in do_call";
+
+  const auto &Callees = Res.CG.getCalleesOfCallAt(IndirectCS);
+  EXPECT_TRUE(llvm::is_contained(Callees, Target))
+      << "target() must be a callee of the indirect call in do_call()";
+}
+
+TEST(AndersenOTFAATest, StructVtableDispatch) {
+  // Hand-rolled C vtable: const struct Ops { read, write }.
+  // ops->write(...) must resolve to myWrite only, not myRead.
+  // Without the struct-vtable path, field-insensitive analysis adds both.
+  auto IRDB = LLVMProjectIRDB::loadOrExit(
+      PathToLLFiles + "andersen_otf_struct_vtable_c_m2r_dbg.ll");
+
+  const auto *DispatchFn = IRDB.getFunctionDefinition("dispatch");
+  const auto *MyRead = IRDB.getFunctionDefinition("myRead");
+  const auto *MyWrite = IRDB.getFunctionDefinition("myWrite");
+  const auto *MainFn = IRDB.getFunctionDefinition("main");
+  ASSERT_NE(MainFn, nullptr);
+  ASSERT_NE(DispatchFn, nullptr);
+  ASSERT_NE(MyRead, nullptr);
+  ASSERT_NE(MyWrite, nullptr);
+
+  auto Res = computeAndersenOTFRaw(IRDB, {MainFn});
+
+  // Find the first non-debug call in dispatch() whose called operand, after
+  // stripping pointer casts and aliases, is not a direct llvm::Function.
+  const llvm::CallBase *IndirectCS = nullptr;
+  for (const auto &I : llvm::instructions(DispatchFn)) {
+    const auto *CS = llvm::dyn_cast<llvm::CallBase>(&I);
+    if (!CS || CS->isDebugOrPseudoInst()) {
+      continue;
+    }
+    if (!llvm::isa<llvm::Function>(
+            CS->getCalledOperand()->stripPointerCastsAndAliases())) {
+      IndirectCS = CS;
+      break;
+    }
+  }
+  ASSERT_NE(IndirectCS, nullptr) << "No indirect call found in dispatch()";
+
+  const auto &Callees = Res.CG.getCalleesOfCallAt(IndirectCS);
+  EXPECT_TRUE(llvm::is_contained(Callees, MyWrite))
+      << "myWrite must be a callee of ops->write(...)";
+  EXPECT_FALSE(llvm::is_contained(Callees, MyRead))
+      << "myRead must not be a callee of ops->write(...) (field 1, not 0)";
+}
+
+} // namespace
+
+// Test-runner entry point: initializes GoogleTest from the command line and
+// returns the result of running all registered tests.
+int main(int Argc, char **Argv) {
+  ::testing::InitGoogleTest(&Argc, Argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/unittests/PhasarLLVM/Pointer/CMakeLists.txt b/unittests/PhasarLLVM/Pointer/CMakeLists.txt
index 7a8857df82..7085c599ae 100644
--- a/unittests/PhasarLLVM/Pointer/CMakeLists.txt
+++ b/unittests/PhasarLLVM/Pointer/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(PointerFlowSources
+  AndersenOTFAATest.cpp
   LLVMAliasSetTest.cpp
   LLVMAliasSetSerializationTest.cpp
   FilteredLLVMAliasSetTest.cpp