src/llvm-project/llvm/include/llvm/Support/BalancedPartitioning.h - toolchain/rustc - Git at Google

 //===- BalancedPartitioning.h ---------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements BalancedPartitioning, a recursive balanced graph
 // partitioning algorithm.
 //
 // The algorithm is used to find an ordering of FunctionNodes while optimizing
 // a specified objective. The algorithm uses recursive bisection; it starts
 // with a collection of unordered FunctionNodes and tries to split them into
 // two sets (buckets) of equal cardinality. Each bisection step is comprised of
 // iterations that greedily swap the FunctionNodes between the two buckets while
 // there is an improvement of the objective. Once the process converges, the
 // problem is divided into two sub-problems of half the size, which are
 // recursively applied for the two buckets. The final ordering of the
 // FunctionNodes is obtained by concatenating the two (recursively computed)
 // orderings.
 //
 // In order to speed up the computation, we limit the depth of the recursive
 // tree by a specified constant (SplitDepth) and apply at most a constant
 // number of greedy iterations per split (IterationsPerSplit). The worst-case
 // time complexity of the implementation is bounded by O(M*log^2 N), where
 // N is the number of FunctionNodes and M is the number of
 // FunctionNode-UtilityNode edges; (assuming that any collection of D
 // FunctionNodes contains O(D) UtilityNodes). Notice that the two different
 // recursive sub-problems are independent and thus can be efficiently processed
 // in parallel.
 //
 // Reference:
 //   * Optimizing Function Layout for Mobile Applications,
 //     https://arxiv.org/abs/2211.09285
 //
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
 #define LLVM_SUPPORT_BALANCED_PARTITIONING_H

 #include "raw_ostream.h"
 #include "llvm/ADT/ArrayRef.h"

 #include <atomic>
 #include <condition_variable>
 #include <mutex>
 #include <random>
 #include <vector>

 namespace llvm {

 class ThreadPool;
 /// A function with a set of utility nodes where it is beneficial to order two
 /// functions close together if they have similar utility nodes
 class BPFunctionNode {
   friend class BalancedPartitioning;

 public:
   using IDT = uint64_t;
   using UtilityNodeT = uint32_t;

   /// \param UtilityNodes the set of utility nodes (must be unique'd)
   BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
       : Id(Id), UtilityNodes(UtilityNodes) {}

   /// The ID of this node
   IDT Id;

   void dump(raw_ostream &OS) const;

 protected:
   /// The list of utility nodes associated with this node
   SmallVector<UtilityNodeT, 4> UtilityNodes;
   /// The bucket assigned by balanced partitioning
   std::optional<unsigned> Bucket;
   /// The index of the input order of the FunctionNodes
   uint64_t InputOrderIndex = 0;

   friend class BPFunctionNodeTest_Basic_Test;
   friend class BalancedPartitioningTest_Basic_Test;
   friend class BalancedPartitioningTest_Large_Test;
 };

 /// Algorithm parameters; default values are tuned on real-world binaries
 struct BalancedPartitioningConfig {
   /// The depth of the recursive bisection
   unsigned SplitDepth = 18;
   /// The maximum number of bp iterations per split
   unsigned IterationsPerSplit = 40;
   /// The probability for a vertex to skip a move from its current bucket to
   /// another bucket; it often helps to escape from a local optima
   float SkipProbability = 0.1f;
   /// Recursive subtasks up to the given depth are added to the queue and
   /// distributed among threads by ThreadPool; all subsequent calls are executed
   /// on the same thread
   unsigned TaskSplitDepth = 9;
 };

 class BalancedPartitioning {
 public:
   BalancedPartitioning(const BalancedPartitioningConfig &Config);

   /// Run recursive graph partitioning that optimizes a given objective.
   void run(std::vector<BPFunctionNode> &Nodes) const;

 private:
   struct UtilitySignature;
   using SignaturesT = SmallVector<UtilitySignature, 4>;
   using FunctionNodeRange =
       iterator_range<std::vector<BPFunctionNode>::iterator>;

   /// A special ThreadPool that allows for spawning new tasks after blocking on
   /// wait(). BalancedPartitioning recursively spawns new threads inside other
   /// threads, so we need to track how many active threads that could spawn more
   /// threads.
   struct BPThreadPool {
     ThreadPool &TheThreadPool;
     std::mutex mtx;
     std::condition_variable cv;
     /// The number of threads that could spawn more threads
     std::atomic<int> NumActiveThreads = 0;
     /// Only true when all threads are down spawning new threads
     bool IsFinishedSpawning = false;
     /// Asynchronous submission of the task to the pool
     template <typename Func> void async(Func &&F);
     /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
     /// acceptable for other threads to add more tasks while blocking on this
     /// call.
     void wait();
     BPThreadPool(ThreadPool &TheThreadPool) : TheThreadPool(TheThreadPool) {}
   };

   /// Run a recursive bisection of a given list of FunctionNodes
   /// \param RecDepth the current depth of recursion
   /// \param RootBucket the initial bucket of the dataVertices
   /// \param Offset the assigned buckets are the range [Offset, Offset +
   /// Nodes.size()]
   void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
               unsigned RootBucket, unsigned Offset,
               std::optional<BPThreadPool> &TP) const;

   /// Run bisection iterations
   void runIterations(const FunctionNodeRange Nodes, unsigned RecDepth,
                      unsigned LeftBucket, unsigned RightBucket,
                      std::mt19937 &RNG) const;

   /// Run a bisection iteration to improve the optimization goal
   /// \returns the total number of moved FunctionNodes
   unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
                         unsigned RightBucket, SignaturesT &Signatures,
                         std::mt19937 &RNG) const;

   /// Try to move \p N from one bucket to another
   /// \returns true iff \p N is moved
   bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
                         unsigned RightBucket, SignaturesT &Signatures,
                         std::mt19937 &RNG) const;

   /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
   /// 1 The method is used for an initial assignment before a bisection step
   void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;

   /// The cost of the uniform log-gap cost, assuming a utility node has \p X
   /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
   float logCost(unsigned X, unsigned Y) const;

   float log2Cached(unsigned i) const;

   const BalancedPartitioningConfig &Config;

   /// Precomputed values of log2(x). Table size is small enough to fit in cache.
   static constexpr unsigned LOG_CACHE_SIZE = 16384;
   float Log2Cache[LOG_CACHE_SIZE];

   /// The signature of a particular utility node used for the bisection step,
   /// i.e., the number of \p FunctionNodes in each of the two buckets
   struct UtilitySignature {
     /// The number of \p FunctionNodes in the left bucket
     unsigned LeftCount = 0;
     /// The number of \p FunctionNodes in the right bucket
     unsigned RightCount = 0;
     /// The cached gain of moving a \p FunctionNode from the left bucket to the
     /// right bucket
     float CachedGainLR;
     /// The cached gain of moving a \p FunctionNode from the right bucket to the
     /// left bucket
     float CachedGainRL;
     /// Whether \p CachedGainLR and \p CachedGainRL are valid
     bool CachedGainIsValid = false;
   };

 protected:
   /// Compute the move gain for uniform log-gap cost
   static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
                         const SignaturesT &Signatures);
   friend class BalancedPartitioningTest_MoveGain_Test;
 };

 } // end namespace llvm

 #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
	//===- BalancedPartitioning.h ---------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements BalancedPartitioning, a recursive balanced graph
	// partitioning algorithm.
	//
	// The algorithm is used to find an ordering of FunctionNodes while optimizing
	// a specified objective. The algorithm uses recursive bisection; it starts
	// with a collection of unordered FunctionNodes and tries to split them into
	// two sets (buckets) of equal cardinality. Each bisection step is comprised of
	// iterations that greedily swap the FunctionNodes between the two buckets while
	// there is an improvement of the objective. Once the process converges, the
	// problem is divided into two sub-problems of half the size, which are
	// recursively applied for the two buckets. The final ordering of the
	// FunctionNodes is obtained by concatenating the two (recursively computed)
	// orderings.
	//
	// In order to speed up the computation, we limit the depth of the recursive
	// tree by a specified constant (SplitDepth) and apply at most a constant
	// number of greedy iterations per split (IterationsPerSplit). The worst-case
	// time complexity of the implementation is bounded by O(M*log^2 N), where
	// N is the number of FunctionNodes and M is the number of
	// FunctionNode-UtilityNode edges; (assuming that any collection of D
	// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
	// recursive sub-problems are independent and thus can be efficiently processed
	// in parallel.
	//
	// Reference:
	// * Optimizing Function Layout for Mobile Applications,
	// https://arxiv.org/abs/2211.09285
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
	#define LLVM_SUPPORT_BALANCED_PARTITIONING_H

	#include "raw_ostream.h"
	#include "llvm/ADT/ArrayRef.h"

	#include <atomic>
	#include <condition_variable>
	#include <mutex>
	#include <random>
	#include <vector>

	namespace llvm {

	class ThreadPool;
	/// A function with a set of utility nodes where it is beneficial to order two
	/// functions close together if they have similar utility nodes
	class BPFunctionNode {
	friend class BalancedPartitioning;

	public:
	using IDT = uint64_t;
	using UtilityNodeT = uint32_t;

	/// \param UtilityNodes the set of utility nodes (must be unique'd)
	BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
	: Id(Id), UtilityNodes(UtilityNodes) {}

	/// The ID of this node
	IDT Id;

	void dump(raw_ostream &OS) const;

	protected:
	/// The list of utility nodes associated with this node
	SmallVector<UtilityNodeT, 4> UtilityNodes;
	/// The bucket assigned by balanced partitioning
	std::optional<unsigned> Bucket;
	/// The index of the input order of the FunctionNodes
	uint64_t InputOrderIndex = 0;

	friend class BPFunctionNodeTest_Basic_Test;
	friend class BalancedPartitioningTest_Basic_Test;
	friend class BalancedPartitioningTest_Large_Test;
	};

	/// Algorithm parameters; default values are tuned on real-world binaries
	struct BalancedPartitioningConfig {
	/// The depth of the recursive bisection
	unsigned SplitDepth = 18;
	/// The maximum number of bp iterations per split
	unsigned IterationsPerSplit = 40;
	/// The probability for a vertex to skip a move from its current bucket to
	/// another bucket; it often helps to escape from a local optima
	float SkipProbability = 0.1f;
	/// Recursive subtasks up to the given depth are added to the queue and
	/// distributed among threads by ThreadPool; all subsequent calls are executed
	/// on the same thread
	unsigned TaskSplitDepth = 9;
	};

	class BalancedPartitioning {
	public:
	BalancedPartitioning(const BalancedPartitioningConfig &Config);

	/// Run recursive graph partitioning that optimizes a given objective.
	void run(std::vector<BPFunctionNode> &Nodes) const;

	private:
	struct UtilitySignature;
	using SignaturesT = SmallVector<UtilitySignature, 4>;
	using FunctionNodeRange =
	iterator_range<std::vector<BPFunctionNode>::iterator>;

	/// A special ThreadPool that allows for spawning new tasks after blocking on
	/// wait(). BalancedPartitioning recursively spawns new threads inside other
	/// threads, so we need to track how many active threads that could spawn more
	/// threads.
	struct BPThreadPool {
	ThreadPool &TheThreadPool;
	std::mutex mtx;
	std::condition_variable cv;
	/// The number of threads that could spawn more threads
	std::atomic<int> NumActiveThreads = 0;
	/// Only true when all threads are down spawning new threads
	bool IsFinishedSpawning = false;
	/// Asynchronous submission of the task to the pool
	template <typename Func> void async(Func &&F);
	/// Blocking wait for all threads to complete. Unlike ThreadPool, it is
	/// acceptable for other threads to add more tasks while blocking on this
	/// call.
	void wait();
	BPThreadPool(ThreadPool &TheThreadPool) : TheThreadPool(TheThreadPool) {}
	};

	/// Run a recursive bisection of a given list of FunctionNodes
	/// \param RecDepth the current depth of recursion
	/// \param RootBucket the initial bucket of the dataVertices
	/// \param Offset the assigned buckets are the range [Offset, Offset +
	/// Nodes.size()]
	void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
	unsigned RootBucket, unsigned Offset,
	std::optional<BPThreadPool> &TP) const;

	/// Run bisection iterations
	void runIterations(const FunctionNodeRange Nodes, unsigned RecDepth,
	unsigned LeftBucket, unsigned RightBucket,
	std::mt19937 &RNG) const;

	/// Run a bisection iteration to improve the optimization goal
	/// \returns the total number of moved FunctionNodes
	unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
	unsigned RightBucket, SignaturesT &Signatures,
	std::mt19937 &RNG) const;

	/// Try to move \p N from one bucket to another
	/// \returns true iff \p N is moved
	bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
	unsigned RightBucket, SignaturesT &Signatures,
	std::mt19937 &RNG) const;

	/// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
	/// 1 The method is used for an initial assignment before a bisection step
	void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;

	/// The cost of the uniform log-gap cost, assuming a utility node has \p X
	/// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
	float logCost(unsigned X, unsigned Y) const;

	float log2Cached(unsigned i) const;

	const BalancedPartitioningConfig &Config;

	/// Precomputed values of log2(x). Table size is small enough to fit in cache.
	static constexpr unsigned LOG_CACHE_SIZE = 16384;
	float Log2Cache[LOG_CACHE_SIZE];

	/// The signature of a particular utility node used for the bisection step,
	/// i.e., the number of \p FunctionNodes in each of the two buckets
	struct UtilitySignature {
	/// The number of \p FunctionNodes in the left bucket
	unsigned LeftCount = 0;
	/// The number of \p FunctionNodes in the right bucket
	unsigned RightCount = 0;
	/// The cached gain of moving a \p FunctionNode from the left bucket to the
	/// right bucket
	float CachedGainLR;
	/// The cached gain of moving a \p FunctionNode from the right bucket to the
	/// left bucket
	float CachedGainRL;
	/// Whether \p CachedGainLR and \p CachedGainRL are valid
	bool CachedGainIsValid = false;
	};

	protected:
	/// Compute the move gain for uniform log-gap cost
	static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
	const SignaturesT &Signatures);
	friend class BalancedPartitioningTest_MoveGain_Test;
	};

	} // end namespace llvm

	#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H