| //===- BalancedPartitioning.cpp -------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements BalancedPartitioning, a recursive balanced graph |
| // partitioning algorithm. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Support/BalancedPartitioning.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/Format.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include "llvm/Support/ThreadPool.h" |
| |
| using namespace llvm; |
| #define DEBUG_TYPE "balanced-partitioning" |
| |
| void BPFunctionNode::dump(raw_ostream &OS) const { |
| OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id, |
| make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket); |
| } |
| |
| template <typename Func> |
| void BalancedPartitioning::BPThreadPool::async(Func &&F) { |
| #if LLVM_ENABLE_THREADS |
| // This new thread could spawn more threads, so mark it as active |
| ++NumActiveThreads; |
| TheThreadPool.async([=]() { |
| // Run the task |
| F(); |
| |
| // This thread will no longer spawn new threads, so mark it as inactive |
| if (--NumActiveThreads == 0) { |
| // There are no more active threads, so mark as finished and notify |
| { |
| std::unique_lock<std::mutex> lock(mtx); |
| assert(!IsFinishedSpawning); |
| IsFinishedSpawning = true; |
| } |
| cv.notify_one(); |
| } |
| }); |
| #else |
| llvm_unreachable("threads are disabled"); |
| #endif |
| } |
| |
void BalancedPartitioning::BPThreadPool::wait() {
#if LLVM_ENABLE_THREADS
  // Block until every task spawned via async() has completed. async() only
  // sets IsFinishedSpawning (and notifies cv) once the active-thread count
  // reaches zero, i.e. once no task can submit further tasks.
  // TODO: We could remove the mutex and condition variable and use
  // std::atomic::wait() instead, but that isn't available until C++20
  {
    std::unique_lock<std::mutex> lock(mtx);
    cv.wait(lock, [&]() { return IsFinishedSpawning; });
    assert(IsFinishedSpawning && NumActiveThreads == 0);
  }
  // Now we can call ThreadPool::wait() since all tasks have been submitted
  TheThreadPool.wait();
#else
  llvm_unreachable("threads are disabled");
#endif
}
| |
| BalancedPartitioning::BalancedPartitioning( |
| const BalancedPartitioningConfig &Config) |
| : Config(Config) { |
| // Pre-computing log2 values |
| Log2Cache[0] = 0.0; |
| for (unsigned I = 1; I < LOG_CACHE_SIZE; I++) |
| Log2Cache[I] = std::log2(I); |
| } |
| |
| void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const { |
| LLVM_DEBUG( |
| dbgs() << format( |
| "Partitioning %d nodes using depth %d and %d iterations per split\n", |
| Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit)); |
| std::optional<BPThreadPool> TP; |
| #if LLVM_ENABLE_THREADS |
| ThreadPool TheThreadPool; |
| if (Config.TaskSplitDepth > 1) |
| TP.emplace(TheThreadPool); |
| #endif |
| |
| // Record the input order |
| for (unsigned I = 0; I < Nodes.size(); I++) |
| Nodes[I].InputOrderIndex = I; |
| |
| auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end()); |
| auto BisectTask = [=, &TP]() { |
| bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP); |
| }; |
| if (TP) { |
| TP->async(std::move(BisectTask)); |
| TP->wait(); |
| } else { |
| BisectTask(); |
| } |
| |
| llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) { |
| return L.Bucket < R.Bucket; |
| }); |
| |
| LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n"); |
| } |
| |
// Recursively bisect [Nodes), assigning final buckets starting at Offset.
// RootBucket identifies this subproblem in the (implicit) recursion tree; its
// children use buckets 2*RootBucket and 2*RootBucket+1. TP, when set, is used
// to run the two child bisections in parallel up to Config.TaskSplitDepth.
void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
                                  unsigned RecDepth, unsigned RootBucket,
                                  unsigned Offset,
                                  std::optional<BPThreadPool> &TP) const {
  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
  if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
    // We've reached the lowest level of the recursion tree. Fall back to the
    // original order and assign to buckets.
    llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
      return L.InputOrderIndex < R.InputOrderIndex;
    });
    for (auto &N : Nodes)
      N.Bucket = Offset++;
    return;
  }

  LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n",
                              NumNodes, RootBucket));

  // Seed the RNG from the bucket id so each subproblem is deterministic
  // regardless of thread scheduling.
  std::mt19937 RNG(RootBucket);

  unsigned LeftBucket = 2 * RootBucket;
  unsigned RightBucket = 2 * RootBucket + 1;

  // Split into two and assign to the left and right buckets
  split(Nodes, LeftBucket);

  // Improve the initial split with local search.
  runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);

  // Split nodes wrt the resulting buckets
  auto NodesMid =
      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
  unsigned MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);

  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());

  // The two halves are disjoint ranges, so the recursive calls may run
  // concurrently; TP is captured by reference to keep a single spawn counter.
  auto LeftRecTask = [=, &TP]() {
    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
  };
  auto RightRecTask = [=, &TP]() {
    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
  };

  // Only parallelize while above the task-split depth and with enough work to
  // amortize the scheduling cost.
  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
    TP->async(std::move(LeftRecTask));
    TP->async(std::move(RightRecTask));
  } else {
    LeftRecTask();
    RightRecTask();
  }
}
| |
| void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes, |
| unsigned RecDepth, unsigned LeftBucket, |
| unsigned RightBucket, |
| std::mt19937 &RNG) const { |
| unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); |
| DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeDegree; |
| for (auto &N : Nodes) |
| for (auto &UN : N.UtilityNodes) |
| ++UtilityNodeDegree[UN]; |
| // Remove utility nodes if they have just one edge or are connected to all |
| // functions |
| for (auto &N : Nodes) |
| llvm::erase_if(N.UtilityNodes, [&](auto &UN) { |
| return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes; |
| }); |
| |
| // Renumber utility nodes so they can be used to index into Signatures |
| DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex; |
| for (auto &N : Nodes) |
| for (auto &UN : N.UtilityNodes) |
| if (!UtilityNodeIndex.count(UN)) |
| UtilityNodeIndex[UN] = UtilityNodeIndex.size(); |
| for (auto &N : Nodes) |
| for (auto &UN : N.UtilityNodes) |
| UN = UtilityNodeIndex[UN]; |
| |
| // Initialize signatures |
| SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size()); |
| for (auto &N : Nodes) { |
| for (auto &UN : N.UtilityNodes) { |
| assert(UN < Signatures.size()); |
| if (N.Bucket == LeftBucket) { |
| Signatures[UN].LeftCount++; |
| } else { |
| Signatures[UN].RightCount++; |
| } |
| } |
| } |
| |
| for (unsigned I = 0; I < Config.IterationsPerSplit; I++) { |
| unsigned NumMovedNodes = |
| runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG); |
| if (NumMovedNodes == 0) |
| break; |
| } |
| } |
| |
// One local-search iteration: compute the move gain of every node, then
// greedily pair the best left-bucket candidate with the best right-bucket
// candidate and try to swap them, stopping once a pair's combined gain is no
// longer positive. Returns the number of nodes actually moved.
unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
                                            unsigned LeftBucket,
                                            unsigned RightBucket,
                                            SignaturesT &Signatures,
                                            std::mt19937 &RNG) const {
  // Init signature cost caches; only signatures invalidated by a previous
  // move (or never initialized) are recomputed.
  for (auto &Signature : Signatures) {
    if (Signature.CachedGainIsValid)
      continue;
    unsigned L = Signature.LeftCount;
    unsigned R = Signature.RightCount;
    assert((L > 0 || R > 0) && "incorrect signature");
    float Cost = logCost(L, R);
    Signature.CachedGainLR = 0.f;
    Signature.CachedGainRL = 0.f;
    // Gain of moving one user of this utility node left->right (LR) or
    // right->left (RL): current cost minus cost after the move.
    if (L > 0)
      Signature.CachedGainLR = Cost - logCost(L - 1, R + 1);
    if (R > 0)
      Signature.CachedGainRL = Cost - logCost(L + 1, R - 1);
    Signature.CachedGainIsValid = true;
  }

  // Compute move gains
  typedef std::pair<float, BPFunctionNode *> GainPair;
  std::vector<GainPair> Gains;
  for (auto &N : Nodes) {
    bool FromLeftToRight = (N.Bucket == LeftBucket);
    float Gain = moveGain(N, FromLeftToRight, Signatures);
    Gains.push_back(std::make_pair(Gain, &N));
  }

  // Collect left and right gains
  auto LeftEnd = llvm::partition(
      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
  auto LeftRange = llvm::make_range(Gains.begin(), LeftEnd);
  auto RightRange = llvm::make_range(LeftEnd, Gains.end());

  // Sort gains in descending order
  auto LargerGain = [](const auto &L, const auto &R) {
    return L.first > R.first;
  };
  llvm::stable_sort(LeftRange, LargerGain);
  llvm::stable_sort(RightRange, LargerGain);

  unsigned NumMovedDataVertices = 0;
  // Walk both sorted lists in lockstep, pairing the i-th best candidates;
  // zip stops at the shorter side, which keeps the bisection balanced.
  for (auto [LeftPair, RightPair] : llvm::zip(LeftRange, RightRange)) {
    auto &[LeftGain, LeftNode] = LeftPair;
    auto &[RightGain, RightNode] = RightPair;
    // Stop when the gain is no longer beneficial
    if (LeftGain + RightGain <= 0.f)
      break;
    // Try to exchange the nodes between buckets; each move may be randomly
    // skipped inside moveFunctionNode, so count successes individually.
    if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket, Signatures, RNG))
      ++NumMovedDataVertices;
    if (moveFunctionNode(*RightNode, LeftBucket, RightBucket, Signatures, RNG))
      ++NumMovedDataVertices;
  }
  return NumMovedDataVertices;
}
| |
| bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N, |
| unsigned LeftBucket, |
| unsigned RightBucket, |
| SignaturesT &Signatures, |
| std::mt19937 &RNG) const { |
| // Sometimes we skip the move. This helps to escape local optima |
| if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <= |
| Config.SkipProbability) |
| return false; |
| |
| bool FromLeftToRight = (N.Bucket == LeftBucket); |
| // Update the current bucket |
| N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket); |
| |
| // Update signatures and invalidate gain cache |
| if (FromLeftToRight) { |
| for (auto &UN : N.UtilityNodes) { |
| auto &Signature = Signatures[UN]; |
| Signature.LeftCount--; |
| Signature.RightCount++; |
| Signature.CachedGainIsValid = false; |
| } |
| } else { |
| for (auto &UN : N.UtilityNodes) { |
| auto &Signature = Signatures[UN]; |
| Signature.LeftCount++; |
| Signature.RightCount--; |
| Signature.CachedGainIsValid = false; |
| } |
| } |
| return true; |
| } |
| |
| void BalancedPartitioning::split(const FunctionNodeRange Nodes, |
| unsigned StartBucket) const { |
| unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); |
| auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2; |
| |
| std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) { |
| return L.InputOrderIndex < R.InputOrderIndex; |
| }); |
| |
| for (auto &N : llvm::make_range(Nodes.begin(), NodesMid)) |
| N.Bucket = StartBucket; |
| for (auto &N : llvm::make_range(NodesMid, Nodes.end())) |
| N.Bucket = StartBucket + 1; |
| } |
| |
| float BalancedPartitioning::moveGain(const BPFunctionNode &N, |
| bool FromLeftToRight, |
| const SignaturesT &Signatures) { |
| float Gain = 0.f; |
| for (auto &UN : N.UtilityNodes) |
| Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR |
| : Signatures[UN].CachedGainRL); |
| return Gain; |
| } |
| |
| float BalancedPartitioning::logCost(unsigned X, unsigned Y) const { |
| return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1)); |
| } |
| |
| float BalancedPartitioning::log2Cached(unsigned i) const { |
| return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i); |
| } |