Merge 2b9968a for LLVM update to 339409

Change-Id: I389c0d2ab0bbd0ca55507c27dace4917a3dd5f6f
diff --git a/README.rst b/README.rst
index 9fb34df..63afb70 100644
--- a/README.rst
+++ b/README.rst
@@ -194,7 +194,7 @@
 **LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
   Include support for the OpenMP Tools Interface (OMPT). 
   This option is supported and ``ON`` by default for x86, x86_64, AArch64, and 
-  PPC64 on Linux*, Windows*, and macOS*.
+  PPC64 on Linux* and macOS*.
   This option is ``OFF`` if this feature is not supported for the platform.
 
 **LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
diff --git a/cmake/OpenMPTesting.cmake b/cmake/OpenMPTesting.cmake
index 165435f..1514d99 100644
--- a/cmake/OpenMPTesting.cmake
+++ b/cmake/OpenMPTesting.cmake
@@ -87,7 +87,9 @@
 
     # Determine major version.
     string(REGEX MATCH "[0-9]+" major "${OPENMP_TEST_C_COMPILER_VERSION}")
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" majorminor "${OPENMP_TEST_C_COMPILER_VERSION}")
     set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${major}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${majorminor}" PARENT_SCOPE)
   endif()
 endfunction()
 
@@ -117,6 +119,7 @@
   # Cannot use CLANG_VERSION because we are not guaranteed that this is already set.
   set(OPENMP_TEST_COMPILER_VERSION "${LLVM_VERSION}")
   set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_MAJOR_VERSION}")
+  set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}")
   # TODO: Implement blockaddress in GlobalISel and remove this flag!
   set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "-fopenmp -fno-experimental-isel")
 endif()
@@ -131,7 +134,7 @@
     # Just use the lowercase of the compiler ID as fallback.
     string(TOLOWER "${OPENMP_TEST_COMPILER_ID}" comp)
   endif()
-  set(OPENMP_TEST_COMPILER_FEATURES "['${comp}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION}']" PARENT_SCOPE)
+  set(OPENMP_TEST_COMPILER_FEATURES "['${comp}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION}']" PARENT_SCOPE)
 endfunction()
 set_test_compiler_features()
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 8340c08..2a1709f 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -44,7 +44,7 @@
 }
 // Return true if this is the master thread.
 __device__ static bool IsMasterThread() {
-  return getMasterThreadId() == getThreadId();
+  return !isSPMDMode() && getMasterThreadId() == getThreadId();
 }
 
 /// Return the provided size aligned to the size of a pointer.
@@ -330,39 +330,40 @@
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////
 
+INLINE void data_sharing_init_stack_common() {
+  omptarget_nvptx_TeamDescr *teamDescr =
+      &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+  for (int WID = 0; WID < WARPSIZE; WID++) {
+    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+    DataSharingState.SlotPtr[WID] = RootS;
+    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  }
+}
+
 // Initialize data sharing data structure. This function needs to be called
 // once at the beginning of a data sharing context (coincides with the kernel
-// initialization).
+// initialization). This function is called only by the MASTER thread of each
+// team in non-SPMD mode.
 EXTERN void __kmpc_data_sharing_init_stack() {
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
+  data_sharing_init_stack_common();
+  omptarget_nvptx_globalArgs.Init();
+}
 
-  // Initialize the data sharing structures. This section should only be
-  // executed by the warp active master threads.
-  if (IsWarpMasterActiveThread()) {
-    unsigned WID = getWarpId();
-    omptarget_nvptx_TeamDescr *teamDescr =
-        &omptarget_nvptx_threadPrivateContext->TeamContext();
-    __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+  // This function initializes the stack pointer with the pointer to the
+  // statically allocated shared memory slots. The size of a shared memory
+  // slot is pre-determined to be 256 bytes.
+  if (threadIdx.x == 0)
+    data_sharing_init_stack_common();
 
-    // If a valid address has been returned then proceed with the initalization.
-    // Otherwise the initialization of the slot has already happened in a
-    // previous call to this function.
-    if (RootS) {
-      DataSharingState.SlotPtr[WID] = RootS;
-      DataSharingState.TailPtr[WID] = RootS;
-      DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
-    }
-  }
-
-  // Currently we only support the sharing of variables between master and
-  // workers. The list of references to shared variables exists only for
-  // the master thread.
-  if (IsMasterThread()) {
-    // Initialize the list of references to arguments.
-    omptarget_nvptx_globalArgs.Init();
-  }
+  __threadfence_block();
 }
 
 // Called at the time of the kernel initialization. This is used to initilize
@@ -372,8 +373,6 @@
 // By default the globalized variables are stored in global memory. If the
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
-//
-// Called by: master, TODO: call by workers
 EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
     int16_t UseSharedMemory) {
   // Frame pointer must be visible to all workers in the same warp.
@@ -385,7 +384,6 @@
     // SlotP will point to either the shared memory slot or an existing
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
-    __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
     void *&StackP = DataSharingState.StackPtr[WID];
 
     // Compute the total memory footprint of the requested data.
@@ -405,62 +403,31 @@
     // of the slot then we need to either re-use the next slot, if one exists,
     // or create a new slot.
     if (EndAddress < RequestedEndAddress) {
+      __kmpc_data_sharing_slot *NewSlot = 0;
       size_t NewSize = PushSize;
 
-      // The new or reused slot for holding the data being pushed.
-      __kmpc_data_sharing_slot *NewSlot = 0;
+      // Allocate at least the default size for each type of slot.
+      // Master is a special case and even though there is only one thread,
+      // it can share more things with the workers. For uniformity, it uses
+      // the full size of a worker warp slot.
+      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+      if (DefaultSlotSize > NewSize)
+        NewSize = DefaultSlotSize;
+      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+          sizeof(__kmpc_data_sharing_slot) + NewSize,
+          "Global memory slot allocation.");
 
-      // Check if there is a next slot.
-      if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
-        // Attempt to reuse an existing slot provided the data fits in the slot.
-        // The leftover data space will not be used.
-        ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
-                                     (uintptr_t)(&ExistingSlot->Data[0]);
+      NewSlot->Next = 0;
+      NewSlot->Prev = SlotP;
+      NewSlot->PrevSlotStackPtr = StackP;
+      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
 
-        // Try to add the data in the next available slot. Search for a slot
-        // with enough space.
-        while (ExistingSlotSize < NewSize) {
-          SlotP->Next = ExistingSlot->Next;
-          SlotP->Next->Prev = ExistingSlot->Prev;
-          free(ExistingSlot);
-          ExistingSlot = SlotP->Next;
-          if (!ExistingSlot)
-            break;
-          ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
-                             (uintptr_t)(&ExistingSlot->Data[0]);
-        }
-
-        // Check if a slot has been found.
-        if (ExistingSlotSize >= NewSize) {
-          NewSlot = ExistingSlot;
-          NewSlot->PrevSlotStackPtr = StackP;
-        }
-      }
-
-      if (!NewSlot) {
-        // Allocate at least the default size for each type of slot.
-        size_t DefaultSlotSize =
-            IsMasterThread() ? DS_Slot_Size : DS_Worker_Warp_Slot_Size;
-        if (DefaultSlotSize > NewSize)
-          NewSize = DefaultSlotSize;
-        NewSlot = (__kmpc_data_sharing_slot *)malloc(
-            sizeof(__kmpc_data_sharing_slot) + NewSize);
-        NewSlot->Next = 0;
-        NewSlot->Prev = SlotP;
-        NewSlot->PrevSlotStackPtr = StackP;
-        NewSlot->DataEnd = &NewSlot->Data[NewSize];
-
-        // Newly allocated slots are also tail slots.
-        TailSlotP = NewSlot;
-
-        // Make previous slot point to the newly allocated slot.
-        SlotP->Next = NewSlot;
-      }
-
+      // Make previous slot point to the newly allocated slot.
+      SlotP->Next = NewSlot;
       // The current slot becomes the new slot.
       SlotP = NewSlot;
       // The stack pointer always points to the next free stack frame.
-      StackP = &NewSlot->Data[PushSize];
+      StackP = &NewSlot->Data[0] + PushSize;
       // The frame pointer always points to the beginning of the frame.
       FrameP = &NewSlot->Data[0];
     } else {
@@ -489,37 +456,27 @@
   if (IsWarpMasterActiveThread()) {
     unsigned WID = getWarpId();
 
+    // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+    // Pointer to next available stack.
     void *&StackP = DataSharingState.StackPtr[WID];
 
-    // Pop current frame from slot.
+    // If the current slot is empty, we need to free the slot after the
+    // pop.
+    bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+    // Pop the frame.
     StackP = FrameStart;
 
-    // If we try to pop the last frame of the current slot we need to
-    // move to the previous slot if there is one.
-    const uintptr_t StartAddress = (uintptr_t)FrameStart;
-    if (StartAddress == (uintptr_t)&SlotP->Data[0]) {
-      if (SlotP->Prev) {
-        // The new stack pointer is the end of the data field of the
-        // previous slot. This will allow the stack pointer to be
-        // used in the computation of the remaining data space in
-        // the current slot.
-        StackP = SlotP->PrevSlotStackPtr;
-        // Reset SlotP to previous slot.
-        SlotP = SlotP->Prev;
-      }
+    if (SlotEmpty && SlotP->Prev) {
+      // Before removing the slot we need to reset StackP.
+      StackP = SlotP->PrevSlotStackPtr;
 
-      // If this will "pop" the last global memory node then it is likely
-      // that we are at the end of the data sharing region and we can
-      // de-allocate any existing global memory slots.
-      if (!SlotP->Prev) {
-        __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
-        while(Tail->Prev) {
-          Tail = Tail->Prev;
-          free(Tail->Next);
-        }
-        Tail->Next=0;
-      }
+      // Remove the slot.
+      SlotP = SlotP->Prev;
+      SafeFree(SlotP->Next, "Free slot.");
+      SlotP->Next = 0;
     }
   }
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index a02d962..680df48 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -485,6 +485,7 @@
 
 
 EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void __kmpc_data_sharing_init_stack_spmd();
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 91eaaca..f3e475d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -86,7 +86,7 @@
 
     T inputUb = ub;
     ub = lb + chunk - 1; // Clang uses i <= ub
-    last = ub == inputUb;
+    last = lb <= inputUb && inputUb <= ub;
     stride = loopSize; // make sure we only do 1 chunk per warp
   }
 
@@ -240,12 +240,13 @@
 
     // Process schedule.
     if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
             (long)tnum, P64(tripCount), schedule);
       schedule = kmp_sched_static_chunk;
       chunk = tripCount; // one thread gets the whole loop
-
     } else if (schedule == kmp_sched_runtime) {
       // process runtime
       omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
@@ -293,7 +294,9 @@
       // compute static chunk
       ST stride;
       int lastiter = 0;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ForStaticChunk(
+          lastiter, lb, ub, stride, chunk,
+          GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
       // save computed params
       omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -315,7 +318,9 @@
       // compute static chunk
       ST stride;
       int lastiter = 0;
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ForStaticNoChunk(
+          lastiter, lb, ub, stride, chunk,
+          GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
       // save computed params
       omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -329,24 +334,18 @@
             omptarget_nvptx_threadPrivateContext->Stride(tid));
 
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       // save sched state
+      int teamId = GetOmpTeamId();
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
       if (GetThreadIdInBlock() == 0) {
         if (chunk < 1)
           chunk = 1;
-        int teamId = GetOmpTeamId();
         omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
         omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
         omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
       }
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
             ", chunk %" PRIu64 "\n",
@@ -361,10 +360,11 @@
   // Support for dispatch next
 
   INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
-                                     Counter &loopLowerBound,
+                                     int64_t &loopLowerBound,
                                      T loopUpperBound) {
     // calculate lower bound for all lanes in the warp
-    lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
+    lb = atomicAdd((unsigned long long *)&loopLowerBound,
+                   (unsigned long long)chunkSize);
     ub = lb + chunkSize - 1;  // Clang uses i <= ub
 
     // 3 result cases:
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 2bc5819..84c61f9 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -122,7 +122,6 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
   void *FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
@@ -192,6 +191,8 @@
   INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
   INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                    uint16_t tid, uint16_t tnum);
+  INLINE void SaveLoopData();
+  INLINE void RestoreLoopData() const;
 
 private:
   // bits for flags: (7 used, 1 free)
@@ -207,6 +208,14 @@
   static const uint8_t TaskDescr_IsParConstr = 0x20;
   static const uint8_t TaskDescr_InParL2P = 0x40;
 
+  struct SavedLoopDescr_items {
+    int64_t loopUpperBound;
+    int64_t nextLowerBound;
+    int64_t chunk;
+    int64_t stride;
+    kmp_sched_t schedule;
+  } loopData;
+
   struct TaskDescr_items {
     uint8_t flags; // 6 bit used (see flag above)
     uint8_t unused;
@@ -292,6 +301,16 @@
     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
   }
 
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
 private:
   omptarget_nvptx_TaskDescr
       levelZeroTaskDescr; // icv for team master initial thread
@@ -301,7 +320,7 @@
   uint64_t lastprivateIterBuffer;
 
   __align__(16)
-      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1];
+      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
   __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
 
@@ -335,16 +354,8 @@
   INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
   INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
   INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
-  // state for dispatch with dyn/guided
-  INLINE Counter &CurrentEvent(int tid) {
-    return currEvent_or_nextLowerBound[tid];
-  }
-  INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; }
-  // state for dispatch with static
-  INLINE Counter &NextLowerBound(int tid) {
-    return currEvent_or_nextLowerBound[tid];
-  }
-  INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; }
+  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+  INLINE int64_t &Stride(int tid) { return stride[tid]; }
 
   INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
 
@@ -373,8 +384,8 @@
   int64_t chunk[MAX_THREADS_PER_TEAM];
   int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
   // state for dispatch with dyn/guided OR static (never use both at a time)
-  Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM];
-  Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM];
+  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+  int64_t stride[MAX_THREADS_PER_TEAM];
   // Queue to which this object must be returned.
   uint64_t SourceQueue;
 };
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index 435a034..086f4c5 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -125,6 +125,30 @@
   items.threadId = tid;
 }
 
+INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
+  loopData.loopUpperBound =
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
+  loopData.nextLowerBound =
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
+  loopData.schedule =
+      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
+  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
+  loopData.stride =
+      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
+  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
+  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
+      loopData.loopUpperBound;
+  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
+      loopData.nextLowerBound;
+  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
+      loopData.stride;
+  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
+      loopData.schedule;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Thread Private Context
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index d454628..33509b6 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -386,6 +386,7 @@
 
   // get current task
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->SaveLoopData();
 
   // allocate new task descriptor and copy value from current one, set prev to
   // it
@@ -417,6 +418,8 @@
       threadId, currTaskDescr->GetPrevTaskDescr());
   // free
   SafeFree(currTaskDescr, (char *)"new seq parallel task");
+  currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->RestoreLoopData();
 }
 
 EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index afa8e81..b813a11 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -25,9 +25,8 @@
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
-  if (tnum != (~0x0)) { // assume swapSize is 32
+  if (tnum != (~0x0)) // assume swapSize is 32
     return 0;
-  }
   return 1;
 }
 
@@ -48,32 +47,21 @@
 
   if (numthread == 1)
     return 1;
-  else if (!__gpu_block_reduce())
+  if (!__gpu_block_reduce())
     return 2;
-  else {
-    if (threadIdx.x == 0)
-      return 1;
-    else
-      return 0;
-  }
+  if (threadIdx.x == 0)
+    return 1;
+  return 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
-  if (threadIdx.x == 0) {
-    return 2;
-  } else {
-    return 0;
-  }
+  return threadIdx.x == 0 ? 2 : 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
-  if (threadIdx.x % 32 == 0) {
-    return 1;
-  } else {
-    return 0;
-  }
+  return (threadIdx.x % 32 == 0) ? 1 : 0;
 }
 
 EXTERN
@@ -161,6 +149,11 @@
                                      kmp_InterWarpCopyFctPtr cpyFct,
                                      bool isSPMDExecutionMode,
                                      bool isRuntimeUninitialized = false) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  if (NumThreads == 1)
+    return 1;
   /*
    * This reduce function handles reduction within a team. It handles
    * parallel regions in both L1 and L2 parallelism levels. It also
@@ -173,9 +166,6 @@
    */
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
   uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
   uint32_t WarpId = BlockThreadId / WARPSIZE;
 
@@ -219,10 +209,6 @@
                                     // early.
     return gpu_irregular_simd_reduce(reduce_data, shflFct);
 
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
-
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
   //
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 2b267c3..4de2039 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@
 INLINE int GetNumberOfProcsInDevice() {
   if (isGenericMode())
     return GetNumberOfWorkersInTeam();
-  else
-    return GetNumberOfThreadsInBlock();
+  return GetNumberOfThreadsInBlock();
 }
 
 INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index a577d7a..7e55df8 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -35,40 +35,40 @@
 
 EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
-  __syncthreads();
+  __kmpc_barrier(loc_ref, tid);
   PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
   return 0;
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  tid = GetLogicalThreadIdInBlock();
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-  if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    // On Volta and newer architectures we require that all lanes in
-    // a warp (at least, all present for the kernel launch) participate in the
-    // barrier.  This is enforced when launching the parallel region.  An
-    // exception is when there are < WARPSIZE workers.  In this case only 1
-    // worker is started, so we don't need a barrier.
+  if (isRuntimeUninitialized()) {
+    if (isSPMDMode())
+      __kmpc_barrier_simple_spmd(loc_ref, tid);
+    else
+      __kmpc_barrier_simple_generic(loc_ref, tid);
+  } else {
+    tid = GetLogicalThreadIdInBlock();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+    int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+        tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
     if (numberOfActiveOMPThreads > 1) {
-#endif
-      // The #threads parameter must be rounded up to the WARPSIZE.
-      int threads =
-          WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+      if (isSPMDMode()) {
+        __kmpc_barrier_simple_spmd(loc_ref, tid);
+      } else {
+        // The #threads parameter must be rounded up to the WARPSIZE.
+        int threads =
+            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
 
-      PRINT(LD_SYNC,
-            "call kmpc_barrier with %d omp threads, sync parameter %d\n",
-            numberOfActiveOMPThreads, threads);
-      // Barrier #1 is for synchronization among active threads.
-      named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+        PRINT(LD_SYNC,
+              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+              numberOfActiveOMPThreads, threads);
+        // Barrier #1 is for synchronization among active threads.
+        named_sync(L1_BARRIER, threads);
+      }
     } // numberOfActiveOMPThreads > 1
-#endif
+    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
-  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
 }
 
 // Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index 7135257..e92a94b 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -48,7 +48,7 @@
   OMP_TGT_MAPTYPE_LITERAL         = 0x100,
   // mapping is implicit
   OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
-  // member of struct, member given by 16 MSBs - 1
+  // member of struct, member given by [16 MSBs] - 1
   OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
 };
 
diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp
index fe2f9f6..872e7f0 100644
--- a/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/libomptarget/plugins/cuda/src/rtl.cpp
@@ -92,7 +92,7 @@
 
 /// Class containing all the device information.
 class RTLDeviceInfoTy {
-  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
 
 public:
   int NumberOfDevices;
@@ -122,7 +122,7 @@
   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     E.Entries.push_back(entry);
   }
@@ -131,7 +131,7 @@
   bool findOffloadEntry(int32_t device_id, void *addr) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     for (auto &it : E.Entries) {
       if (it.addr == addr)
@@ -145,7 +145,7 @@
   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     int32_t size = E.Entries.size();
 
@@ -167,7 +167,8 @@
   void clearOffloadEntriesTable(int32_t device_id) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
diff --git a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
index 558388d..951710a 100644
--- a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -66,7 +66,7 @@
 
 /// Class containing all the device information.
 class RTLDeviceInfoTy {
-  std::vector<FuncOrGblEntryTy> FuncGblEntries;
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
 
 public:
   std::list<DynLibTy> DynLibs;
@@ -76,7 +76,8 @@
                           __tgt_offload_entry *end) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     E.Table.EntriesBegin = begin;
     E.Table.EntriesEnd = end;
@@ -86,7 +87,7 @@
   bool findOffloadEntry(int32_t device_id, void *addr) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
          i < e; ++i) {
@@ -101,7 +102,7 @@
   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
            "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
 
     return &E.Table;
   }
diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
index a48d667..0f32f4e 100644
--- a/libomptarget/src/interface.cpp
+++ b/libomptarget/src/interface.cpp
@@ -33,265 +33,36 @@
   RTLs.UnregisterLib(desc);
 }
 
-// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
-// translate_map, cleanup_map) will be removed once the compiler starts using
-// the new map types.
-
-// Old map types
-enum tgt_oldmap_type {
-  OMP_TGT_OLDMAPTYPE_TO          = 0x001, // copy data from host to device
-  OMP_TGT_OLDMAPTYPE_FROM        = 0x002, // copy data from device to host
-  OMP_TGT_OLDMAPTYPE_ALWAYS      = 0x004, // copy regardless of the ref. count
-  OMP_TGT_OLDMAPTYPE_DELETE      = 0x008, // force unmapping of data
-  OMP_TGT_OLDMAPTYPE_MAP_PTR     = 0x010, // map pointer as well as pointee
-  OMP_TGT_OLDMAPTYPE_FIRST_MAP   = 0x020, // first occurrence of mapped variable
-  OMP_TGT_OLDMAPTYPE_RETURN_PTR  = 0x040, // return TgtBase addr of mapped data
-  OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
-  OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100  // copy by value - not mapped
-};
-
-// Temporary functions for map translation and cleanup
-struct combined_entry_t {
-  int num_members; // number of members in combined entry
-  void *base_addr; // base address of combined entry
-  void *begin_addr; // begin address of combined entry
-  void *end_addr; // size of combined entry
-};
-
-static void translate_map(int32_t arg_num, void **args_base, void **args,
-    int64_t *arg_sizes, int64_t *arg_types, int32_t &new_arg_num,
-    void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
-    int64_t *&new_arg_types, bool is_target_construct) {
-  if (arg_num <= 0) {
-    DP("Nothing to translate\n");
-    new_arg_num = 0;
-    return;
-  }
-
-  // array of combined entries
-  combined_entry_t *cmb_entries =
-      (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
-  // number of combined entries
-  long num_combined = 0;
-  // old entry is MAP_PTR?
-  bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool));
-  // old entry is member of member_of[old] cmb_entry
-  int *member_of = (int *) alloca(arg_num * sizeof(int));
-  // temporary storage for modifications of the original arg_types
-  int64_t *mod_arg_types = (int64_t *) alloca(arg_num  *sizeof(int64_t));
-
-  DP("Translating %d map entries\n", arg_num);
-  for (int i = 0; i < arg_num; ++i) {
-    member_of[i] = -1;
-    is_ptr_old[i] = false;
-    mod_arg_types[i] = arg_types[i];
-    // Scan previous entries to see whether this entry shares the same base
-    for (int j = 0; j < i; ++j) {
-      void *new_begin_addr = NULL;
-      void *new_end_addr = NULL;
-
-      if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
-        if (args_base[i] == args[j]) {
-          if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) {
-            DP("Entry %d has the same base as entry %d's begin address\n", i,
-                j);
-            new_begin_addr = args_base[i];
-            new_end_addr = (char *)args_base[i] + sizeof(void *);
-            assert(arg_sizes[j] == sizeof(void *));
-            is_ptr_old[j] = true;
-          } else {
-            DP("Entry %d has the same base as entry %d's begin address, but "
-                "%d's base was a MAP_PTR too\n", i, j, j);
-            int32_t to_from_always_delete =
-                OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM |
-                OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE;
-            if (mod_arg_types[j] & to_from_always_delete) {
-              DP("Resetting to/from/always/delete flags for entry %d because "
-                  "it is only a pointer to pointer\n", j);
-              mod_arg_types[j] &= ~to_from_always_delete;
-            }
-          }
-        }
-      } else {
-        if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) &&
-            args_base[i] == args_base[j]) {
-          DP("Entry %d has the same base address as entry %d\n", i, j);
-          new_begin_addr = args[i];
-          new_end_addr = (char *)args[i] + arg_sizes[i];
-        }
-      }
-
-      // If we have combined the entry with a previous one
-      if (new_begin_addr) {
-        int id;
-        if(member_of[j] == -1) {
-          // We have a new entry
-          id = num_combined++;
-          DP("Creating new combined entry %d for old entry %d\n", id, j);
-          // Initialize new entry
-          cmb_entries[id].num_members = 1;
-          cmb_entries[id].base_addr = args_base[j];
-          if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
-            cmb_entries[id].begin_addr = args_base[j];
-            cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j];
-          } else {
-            cmb_entries[id].begin_addr = args[j];
-            cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j];
-          }
-          member_of[j] = id;
-        } else {
-          // Reuse existing combined entry
-          DP("Reusing existing combined entry %d\n", member_of[j]);
-          id = member_of[j];
-        }
-
-        // Update combined entry
-        DP("Adding entry %d to combined entry %d\n", i, id);
-        cmb_entries[id].num_members++;
-        // base_addr stays the same
-        cmb_entries[id].begin_addr =
-            std::min(cmb_entries[id].begin_addr, new_begin_addr);
-        cmb_entries[id].end_addr =
-            std::max(cmb_entries[id].end_addr, new_end_addr);
-        member_of[i] = id;
-        break;
-      }
-    }
-  }
-
-  DP("New entries: %ld combined + %d original\n", num_combined, arg_num);
-  new_arg_num = arg_num + num_combined;
-  new_args_base = (void **) malloc(new_arg_num * sizeof(void *));
-  new_args = (void **) malloc(new_arg_num * sizeof(void *));
-  new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
-  new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
-
-  const int64_t alignment = 8;
-
-  int next_id = 0; // next ID
-  int next_cid = 0; // next combined ID
-  int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int));
-  for (int i = 0; i < arg_num; ++i) {
-    // It is member_of
-    if (member_of[i] == next_cid) {
-      int cid = next_cid++; // ID of this combined entry
-      int nid = next_id++; // ID of the new (global) entry
-      combined_to_new_id[cid] = nid;
-      DP("Combined entry %3d will become new entry %3d\n", cid, nid);
-
-      int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment;
-      if (padding) {
-        DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n",
-            padding, DPxPTR(cmb_entries[cid].begin_addr));
-        cmb_entries[cid].begin_addr =
-            (char *)cmb_entries[cid].begin_addr - padding;
-      }
-
-      new_args_base[nid] = cmb_entries[cid].base_addr;
-      new_args[nid] = cmb_entries[cid].begin_addr;
-      new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr -
-          (char *)cmb_entries[cid].begin_addr);
-      new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", "
-          "size %" PRId64 ", type 0x%" PRIx64 "\n", nid,
-          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-          new_arg_types[nid]);
-    } else if (member_of[i] != -1) {
-      DP("Combined entry %3d has been encountered before, do nothing\n",
-          member_of[i]);
-    }
-
-    // Now that the combined entry (the one the old entry was a member of) has
-    // been inserted into the new arguments list, proceed with the old entry.
-    int nid = next_id++;
-    DP("Old entry %3d will become new entry %3d\n", i, nid);
-
-    new_args_base[nid] = args_base[i];
-    new_args[nid] = args[i];
-    new_arg_sizes[nid] = arg_sizes[i];
-    int64_t old_type = mod_arg_types[i];
-
-    if (is_ptr_old[i]) {
-      // Reset TO and FROM flags
-      old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM);
-    }
-
-    if (member_of[i] == -1) {
-      if (!is_target_construct)
-        old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM;
-      new_arg_types[nid] = old_type;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
-          ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid,
-          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-          new_arg_types[nid], i);
-    } else {
-      // Old entry is not FIRST_MAP
-      old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP;
-      // Add MEMBER_OF
-      int new_member_of = combined_to_new_id[member_of[i]];
-      old_type |= ((int64_t)new_member_of + 1) << 48;
-      new_arg_types[nid] = old_type;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
-        ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid,
-        DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-        new_arg_types[nid], i, new_member_of);
-    }
-  }
-}
-
-static void cleanup_map(int32_t new_arg_num, void **new_args_base,
-    void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types,
-    int32_t arg_num, void **args_base) {
-  if (new_arg_num > 0) {
-    int offset = new_arg_num - arg_num;
-    for (int32_t i = 0; i < arg_num; ++i) {
-      // Restore old base address
-      args_base[i] = new_args_base[i+offset];
-    }
-    free(new_args_base);
-    free(new_args);
-    free(new_arg_sizes);
-    free(new_arg_types);
-  }
-}
-
 /// creates host-to-target data mapping, stores it in the
 /// libomptarget.so internal structure (an entry in a stack of data maps)
 /// and passes the data to the device.
 EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  DP("Entering data begin region for device %ld with %d mappings\n", device_id,
-     arg_num);
+  DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+      device_id, arg_num);
 
   // No devices available?
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
-    DP("Use default device id %ld\n", device_id);
+    DP("Use default device id %" PRId64 "\n", device_id);
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
   DeviceTy& Device = Devices[device_id];
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, false);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
 
-  //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
-  target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+  target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
 }
 
 EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
@@ -321,32 +92,25 @@
   size_t Devices_size = Devices.size();
   RTLsMtx.unlock();
   if (Devices_size <= (size_t)device_id) {
-    DP("Device ID  %ld does not have a matching RTL.\n", device_id);
+    DP("Device ID  %" PRId64 " does not have a matching RTL.\n", device_id);
     return;
   }
 
   DeviceTy &Device = Devices[device_id];
   if (!Device.IsInit) {
-    DP("uninit device: ignore");
+    DP("Uninit device: ignore");
     return;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, false);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
 
-  //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
-  target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+  target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
 }
 
 EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
@@ -370,7 +134,7 @@
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
@@ -391,35 +155,28 @@
 
 EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  DP("Entering target region with entry point " DPxMOD " and device Id %ld\n",
-     DPxPTR(host_ptr), device_id);
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
 
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return OFFLOAD_FAIL;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, true);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
 
-  //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-  //    arg_types, 0, 0, false /*team*/, false /*recursive*/);
-  int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
-      new_arg_sizes, new_arg_types, 0, 0, false /*team*/);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, 0, 0, false /*team*/);
 
   return rc;
 }
@@ -438,36 +195,28 @@
 EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr,
     int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
     int64_t *arg_types, int32_t team_num, int32_t thread_limit) {
-  DP("Entering target region with entry point " DPxMOD " and device Id %ld\n",
-     DPxPTR(host_ptr), device_id);
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
 
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return OFFLOAD_FAIL;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, true);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
 
-  //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-  //              arg_types, team_num, thread_limit, true /*team*/,
-  //              false /*recursive*/);
-  int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
-      new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, team_num, thread_limit, true /*team*/);
 
   return rc;
 }
@@ -492,11 +241,11 @@
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
-  DP("__kmpc_push_target_tripcount(%ld, %" PRIu64 ")\n", device_id,
+  DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id,
       loop_tripcount);
   Devices[device_id].loopTripCnt = loop_tripcount;
 }
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index d5574fb..3cc09b4 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -25,6 +25,38 @@
 int DebugLevel = 0;
 #endif // OMPTARGET_DEBUG
 
+/* All begin addresses for partially mapped structs must be 8-aligned in order
+ * to ensure proper alignment of members. E.g.
+ *
+ * struct S {
+ *   int a;   // 4-aligned
+ *   int b;   // 4-aligned
+ *   int *p;  // 8-aligned
+ * } s1;
+ * ...
+ * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
+ * {
+ *   s1.b = 5;
+ *   for (int i...) s1.p[i] = ...;
+ * }
+ *
+ * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
+ * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
+ * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
+ * requirements for its type. Now, when we allocate memory on the device, in
+ * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
+ * This means that the chunk of the struct on the device will start at a
+ * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
+ * address of p will be a misaligned 0x204 (on the host there was no need to add
+ * padding between b and p, so p comes exactly 4 bytes after b). If the device
+ * kernel tries to access s1.p, a misaligned address error occurs (as reported
+ * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
+ * extending the size of the allocated chuck accordingly, the chuck on the
+ * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
+ * &s1.p=0x208, as they should be to satisfy the alignment requirements.
+ */
+static const int64_t alignment = 8;
+
 /// Map global data and execute pending ctors
 static int InitLibrary(DeviceTy& Device) {
   /*
@@ -172,7 +204,7 @@
   return OFFLOAD_SUCCESS;
 }
 
-static short member_of(int64_t type) {
+static int32_t member_of(int64_t type) {
   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
 }
 
@@ -189,10 +221,33 @@
 
     void *HstPtrBegin = args[i];
     void *HstPtrBase = args_base[i];
+    int64_t data_size = arg_sizes[i];
+
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
     // Address of pointer on the host and device, respectively.
     void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
     bool IsNew, Pointer_IsNew;
     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
+    // have reached this point via __tgt_target_data_begin and not __tgt_target
+    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
+    // associated with a target region, so there are no target parameters). This
+    // may be considered a hack, we could revise the scheme in the future.
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
       DP("Has a pointer entry: \n");
@@ -213,28 +268,22 @@
     }
 
     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
-        arg_sizes[i], IsNew, IsImplicit, UpdateRef);
-    if (!TgtPtrBegin && arg_sizes[i]) {
-      // If arg_sizes[i]==0, then the argument is a pointer to NULL, so
-      // getOrAlloc() returning NULL is not an error.
+        data_size, IsNew, IsImplicit, UpdateRef);
+    if (!TgtPtrBegin && data_size) {
+      // If data_size==0, then the argument could be a zero-length pointer to
+      // NULL, so getOrAlloc() returning NULL is not an error.
       DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
           "illegal mapping).\n");
     }
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-        " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
+        " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
         (IsNew ? "" : " not"));
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
-      void *ret_ptr;
-      if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
-        ret_ptr = Pointer_TgtPtrBegin;
-      else {
-        bool IsLast; // not used
-        ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false);
-      }
-
-      DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr));
-      args_base[i] = ret_ptr;
+      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
+      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
+      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
+      args_base[i] = TgtPtrBase;
     }
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
@@ -243,7 +292,7 @@
         copy = true;
       } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
         // Copy data only if the "parent" struct has RefCount==1.
-        short parent_idx = member_of(arg_types[i]);
+        int32_t parent_idx = member_of(arg_types[i]);
         long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
         assert(parent_rc > 0 && "parent struct not found");
         if (parent_rc == 1) {
@@ -253,8 +302,8 @@
 
       if (copy) {
         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
-            arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           rc = OFFLOAD_FAIL;
@@ -297,16 +346,33 @@
       continue;
 
     void *HstPtrBegin = args[i];
+    int64_t data_size = arg_sizes[i];
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
     bool IsLast;
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
         (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
     bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
-    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
+    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
         UpdateRef);
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-        " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
+        " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
         (IsLast ? "" : " not"));
 
     bool DelEntry = IsLast || ForceDelete;
@@ -324,7 +390,7 @@
         if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
             !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
           // Copy data only if the "parent" struct has RefCount==1.
-          short parent_idx = member_of(arg_types[i]);
+          int32_t parent_idx = member_of(arg_types[i]);
           long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
           assert(parent_rc > 0 && "parent struct not found");
           if (parent_rc == 1) {
@@ -334,8 +400,8 @@
 
         if (DelEntry || Always || CopyMember) {
           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-              arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]);
+              data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
           if (rt != OFFLOAD_SUCCESS) {
             DP("Copying data from device failed.\n");
             rc = OFFLOAD_FAIL;
@@ -348,7 +414,7 @@
       // copies. If the struct is going to be deallocated, remove any remaining
       // shadow pointer entries for this struct.
       uintptr_t lb = (uintptr_t) HstPtrBegin;
-      uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i];
+      uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
       Device.ShadowMtx.lock();
       for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
           it != Device.ShadowPtrMap.end(); ++it) {
@@ -378,7 +444,7 @@
 
       // Deallocate map
       if (DelEntry) {
-        int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete);
+        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Deallocating data from device failed.\n");
           rc = OFFLOAD_FAIL;
diff --git a/libomptarget/test/CMakeLists.txt b/libomptarget/test/CMakeLists.txt
index 1662ff7..4e9c7ab 100644
--- a/libomptarget/test/CMakeLists.txt
+++ b/libomptarget/test/CMakeLists.txt
@@ -1,6 +1,6 @@
 # CMakeLists.txt file for unit testing OpenMP offloading runtime library.
-if(NOT "${OPENMP_TEST_COMPILER_ID}" STREQUAL "Clang" OR
-   ${OPENMP_TEST_COMPILER_VERSION} VERSION_LESS 6.0.0)
+if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR
+   OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0)
   libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.")
   libomptarget_warning_say("The check-libomptarget target will not be available!")
   return()
diff --git a/offload/Makefile b/offload/Makefile
deleted file mode 100755
index 75e3744..0000000
--- a/offload/Makefile
+++ /dev/null
@@ -1,224 +0,0 @@
-#

-##//===----------------------------------------------------------------------===//

-#//

-#//                     The LLVM Compiler Infrastructure

-#//

-#// This file is dual licensed under the MIT and the University of Illinois Open

-#// Source Licenses. See LICENSE.txt for details.

-#//

-#//===----------------------------------------------------------------------===//

-#

-

-# MAKEFILE PARAMETERS

-#

-# root_dir - path to root directory of liboffload

-# build_dir - path to build directory

-# mpss_dir - path to root directory of mpss

-# mpss_version - version of the mpss (e.g., version "3.3.x" would be "33")

-# libiomp_host_dir - path to host libiomp directory (unnecessary if compiler_host is icc)

-# libiomp_target_dir - path to target libiomp directory (unnecesarry if compiler_target is icc)

-# omp_header_dir - path to omp.h (unnecessary if compiler_host and compiler_target are icc)

-# os_host - host operating system

-# os_target - target operating system

-# compiler_host - host compiler

-# compiler_target - target compiler

-# options_host - additional options for host compiler

-# options_target - additional options for target compiler

-#

-

-# Directories

-root_dir?=.

-build_dir?=$(root_dir)/build

-build_host_dir=$(build_dir)/host

-build_target_dir=$(build_dir)/target

-obj_host_dir=$(build_dir)/obj_host

-obj_target_dir=$(build_dir)/obj_target

-source_dir=$(root_dir)/src

-imported_dir=$(source_dir)/imported

-

-# OS

-os_host?=linux

-os_target?=linux

-ifneq ($(os_host)_$(os_target), linux_linux)

-  $(error "Only linux is supported")

-endif

-

-# Compilers

-compiler_host?=gcc

-compiler_target?=gcc

-

-# MPSS

-mpss_version?=30

-mpss_dir?=/

-mpss_present=$(shell if test -d $(mpss_dir); then echo OK; else echo KO; fi)

-ifneq ($(mpss_present), OK)

-  $(error "Cannot find MPSS directory $(mpss_dir)")

-endif

-

-ifeq ($(shell test $(mpss_version) -gt 33; echo $$?), 0)

-  coi_dir=$(mpss_dir)/sysroots/k1om-mpss-linux/usr

-  coi_include=$(coi_dir)/include/intel-coi

-  coi_lib_host=$(mpss_dir)/lib64

-  coi_lib_device=$(coi_dir)/lib64

-else

-  coi_dir=$(mpss_dir)/opt/intel/mic/coi

-  coi_include=$(coi_dir)/include

-  coi_lib_host=$(coi_dir)/host-linux-release/lib

-  coi_lib_device=$(coi_dir)/device-linux-release/lib

-endif

-myo_dir=$(mpss_dir)/opt/intel/mic/myo

-

-# Sources

-src_liboffload_common=dv_util.cpp liboffload_error.c liboffload_msg.c offload_common.cpp offload_table.cpp offload_trace.cpp offload_util.cpp

-

-src_liboffload_host=$(src_liboffload_common) cean_util.cpp coi/coi_client.cpp compiler_if_host.cpp offload_engine.cpp offload_env.cpp offload_host.cpp offload_omp_host.cpp offload_timer_host.cpp offload_orsl.cpp orsl-lite/lib/orsl-lite.c offload_myo_host.cpp

-src_liboffload_host:=$(foreach file,$(src_liboffload_host),$(source_dir)/$(file))

-

-src_liboffload_target=$(src_liboffload_common) coi/coi_server.cpp compiler_if_target.cpp offload_omp_target.cpp offload_target.cpp offload_timer_target.cpp offload_myo_target.cpp

-src_liboffload_target:=$(foreach file,$(src_liboffload_target),$(source_dir)/$(file))

-

-src_ofld=ofldbegin.cpp ofldend.cpp

-src_ofld:=$(foreach file,$(src_ofld),$(source_dir)/$(file))

-

-headers=$(wildcard $(source_dir)/*.h) $(wildcard $(source_dir)/coi/*.h) $(wildcard $(source_dir)/orsl-lite/include/*.h)

-ifneq ($(omp_header_dir), )

-  headers+=$(imported_dir)/omp.h

-endif

-

-# Objects

-obj_liboffload_host=$(notdir $(src_liboffload_host))

-obj_liboffload_host:=$(obj_liboffload_host:.cpp=.o)

-obj_liboffload_host:=$(obj_liboffload_host:.c=.o)

-obj_liboffload_host:=$(foreach file,$(obj_liboffload_host),$(obj_host_dir)/$(file))

-

-obj_liboffload_target=$(notdir $(src_liboffload_target))

-obj_liboffload_target:=$(obj_liboffload_target:.cpp=.o)

-obj_liboffload_target:=$(obj_liboffload_target:.c=.o)

-obj_liboffload_target:=$(foreach file,$(obj_liboffload_target),$(obj_target_dir)/$(file))

-

-obj_ofld=$(notdir $(src_ofld))

-obj_ofld:=$(obj_ofld:.cpp=.o)

-obj_ofld_host=$(foreach file,$(obj_ofld),$(build_host_dir)/$(file))

-obj_ofld_target=$(foreach file,$(obj_ofld),$(build_target_dir)/$(file))

-

-# Options

-opts_common=-O2 -w -fpic -c -DCOI_LIBRARY_VERSION=2 -DMYO_SUPPORT -DOFFLOAD_DEBUG=1 -DSEP_SUPPORT -DTIMING_SUPPORT -I$(coi_include) -I$(myo_dir)/include -I$(source_dir)

-ifneq ($(omp_header_dir), )

-  opts_common+=-I$(imported_dir)

-endif

-

-opts_liboffload=-shared -Wl,-soname,liboffload.so.5 -ldl -lstdc++ -liomp5

-

-opts_liboffload_host=$(opts_liboffload) -L$(coi_lib_host) -lcoi_host -L$(myo_dir)/lib -lmyo-client

-ifneq ($(libiomp_host_dir), )

-  opts_liboffload_host+=-L$(libiomp_host_dir)

-endif

-

-opts_liboffload_target=$(opts_liboffload) -L$(coi_lib_device) -lcoi_device -L$(myo_dir)/lib -lmyo-service

-ifneq ($(libiomp_target_dir), )

-  opts_liboffload_target+=-L$(libiomp_target_dir)

-endif

-

-options_host?=

-opts_host=$(options_host) -DHOST_LIBRARY=1 -DMPSS_VERSION=$(mpss_version)

-ifeq ($(os_host), linux)

-  opts_host+=-DLINUX

-endif

-

-options_target?=

-opts_target=$(options_target) -DHOST_LIBRARY=0

-ifeq ($(os_target), linux)

-  opts_target+=-DLINUX

-endif

-ifeq ($(compiler_target), icc)

-  opts_target+=-mmic

-endif

-

-# Make targets

-.PHONY: all clean info

-

-all: info $(build_host_dir)/liboffload.so $(build_target_dir)/liboffload.so $(obj_ofld_host) $(obj_ofld_target)

-

-

-$(build_host_dir)/liboffload.so: $(build_host_dir)/liboffload.so.5 | $(build_host_dir)

-	ln -f $< $@

-

-$(build_host_dir)/liboffload.so.5: $(obj_liboffload_host) | $(build_host_dir)

-	$(compiler_host) $(opts_liboffload_host) $(opts_host) $^ -o $@

-

-$(obj_host_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_host_dir)

-	$(compiler_host) $(opts_common) $(opts_host) $< -o $@

-

-$(obj_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_host_dir)

-	$(compiler_host) $(opts_common) $(opts_host) $< -o $@

-

-$(obj_host_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_host_dir)

-	$(compiler_host) $(opts_common) $(opts_host) $< -o $@

-

-$(obj_host_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_host_dir)

-	$(compiler_host) $(opts_common) $(opts_host) $< -o $@

-

-

-$(build_target_dir)/liboffload.so: $(build_target_dir)/liboffload.so.5 | $(build_target_dir)

-	ln -f $< $@

-

-$(build_target_dir)/liboffload.so.5: $(obj_liboffload_target) | $(build_target_dir)

-	$(compiler_target) $(opts_liboffload_target) $(opts_target) $^ -o $@

-

-$(obj_target_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_target_dir)

-	$(compiler_target) $(opts_common) $(opts_target) $< -o $@

-

-$(obj_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_target_dir)

-	$(compiler_target) $(opts_common) $(opts_target) $< -o $@

-

-$(obj_target_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_target_dir)

-	$(compiler_target) $(opts_common) $(opts_target) $< -o $@

-

-$(obj_target_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_target_dir)

-	$(compiler_target) $(opts_common) $(opts_target) $< -o $@

-

-

-$(build_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_host_dir)

-	$(compiler_host) $(opts_common) $(opts_host) $< -o $@

-

-$(build_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_target_dir)

-	$(compiler_target) $(opts_common) $(opts_target) $< -o $@

-

-

-$(imported_dir)/omp.h: $(omp_header_dir)/omp.h | $(imported_dir)

-	cp $< $@

-

-

-$(build_host_dir) $(build_target_dir) $(obj_host_dir) $(obj_target_dir): | $(build_dir)

-	$(shell mkdir -p $@ >/dev/null 2>/dev/null)

-	@echo "Created $@ directory"

-

-$(build_dir):

-	$(shell mkdir -p $@ >/dev/null 2>/dev/null)

-	@echo "Created $@ directory"

-

-$(imported_dir):

-	$(shell mkdir -p $@ >/dev/null 2>/dev/null)

-	@echo "Created $@ directory"

-

-

-clean:

-	$(shell rm -rf $(build_dir))

-	@echo "Remove $(build_dir) directory"

-

-

-info:

-	@echo "root_dir = $(root_dir)"

-	@echo "build_dir = $(build_dir)"

-	@echo "mpss_dir = $(mpss_dir)"

-	@echo "mpss_version = $(mpss_version)"

-	@echo "libiomp_host_dir = $(libiomp_host_dir)"

-	@echo "libiomp_target_dir = $(libiomp_target_dir)"

-	@echo "omp_header_dir = $(omp_header_dir)"

-	@echo "os_host = $(os_host)"

-	@echo "os_target = $(os_target)"

-	@echo "compiler_host = $(compiler_host)"

-	@echo "compiler_target = $(compiler_target)"

-	@echo "options_host = $(options_host)"

-	@echo "options_target = $(options_target)"

-

diff --git a/offload/README.txt b/offload/README.txt
deleted file mode 100755
index eb9fb1d..0000000
--- a/offload/README.txt
+++ /dev/null
@@ -1,129 +0,0 @@
-

-	       README for Intel(R) Offload Runtime Library

-	       ===========================================

-

-How to Build Documentation

-==========================

-

-The main documentation is in Doxygen* format, and this distribution

-should come with pre-built PDF documentation in doc/Reference.pdf.

-However, an HTML version can be built by executing:

-

-% doxygen doc/doxygen/config

-

-in this directory.

-

-That will produce HTML documentation in the doc/doxygen/generated

-directory, which can be accessed by pointing a web browser at the

-index.html file there.

-

-If you don't have Doxygen installed, you can download it from

-www.doxygen.org.

-

-

-Software Requirements

-=====================

-

-Intel(R) Offload Runtime Library requires additional software:

-

-1) Intel(R) OpenMP* Runtime Library.  You can either download the source

-code for that (from openmprtl.org or openmp.llvm.org) or simply use the

-compiled version distributed with the Intel compilers.

-2) Intel(R) COI Runtime Library and Intel(R) MYO Runtime Library.  These

-libraries are part of Intel(R) Manycore Platform Software Stack (MPSS).  You

-can download MPSS source code or binaries from

-software.intel.com/en-us/articles/intel-manycore-platform-software-stack-mpss.

-Binaries include host libraries for Intel(R) 64 Architecture and target

-libraries for Intel(R) Many Integrated Core Architecture.

-

-Also you will require all of the libraries that enable the target code to run

-on device.  If you target the Intel(R) Xeon Phi (TM) coprocessor, these

-libraries can be taken from MPSS too.

-

-

-How to Build the Intel(R) Offload Runtime Library

-=================================================

-

-The Makefile at the top-level will attempt to detect what it needs to

-build the Intel(R) Offload Runtime Library.  To see the default settings,

-type:

-

-make info

-

-You can change the Makefile's behavior with the following options:

-

-root_dir:	      The path to the top-level directory containing the

-		      top-level Makefile.  By default, this will take on the

-		      value of the current working directory.

-

-build_dir:	      The path to the build directory.  By default, this will

-		      take on value [root_dir]/build.

-

-mpss_dir:	      The path to the Intel(R) Manycore Platform Software

-		      Stack install directory.  By default, this will take on

-		      the value of operating system's root directory.

-

-libiomp_host_dir:     The path to the host Intel(R) OpenMP* Runtime Library.

-		      This option is required when the host compiler is other

-		      than icc.

-

-libiomp_target_dir:   The path to the target Intel(R) OpenMP* Runtime

-		      Library.  This option is required when the target

-		      compiler is other than icc.

-

-omp_header_dir:       The path to the header file <omp.h> of Intel(R) OpenMP*

-		      Runtime Library.  This option is required if either host

-		      or target compiler is other than icc.

-

-os_host:	      Operating system on host.  Currently supports only

-		      "linux" which is set by default.

-

-os_target:	      Operating system on target device.  Currently supports

-		      only "linux" which is set by default.

-

-compiler_host:	      Which compiler to use for the build of the host part.

-		      Defaults to "gcc"*.  Also supports "icc" and "clang"*.

-		      You should provide the full path to the compiler or it

-		      should be in the user's path.

-

-compiler_host:	      Which compiler to use for the build of the target part.

-		      Defaults to "gcc"*.  Also supports "icc" and "clang"*.

-		      You should provide the full path to the compiler or it

-		      should be in the user's path.

-

-options_host:	      Additional options for the host compiler.

-

-options_target:       Additional options for the target compiler.

-

-To use any of the options above, simple add <option_name>=<value>.  For

-example, if you want to build with icc instead of gcc, type:

-

-make compiler_host=icc compiler_target=icc

-

-

-Supported RTL Build Configurations

-==================================

-

-Supported Architectures: Intel(R) 64, and Intel(R) Many Integrated

-Core Architecture

-

-	      ---------------------------------------------

-	      |   icc/icl     |    gcc      |    clang    |

---------------|---------------|---------------------------|

-| Linux* OS   |      Yes      |     Yes(1)  |     Yes(1)  |

-| OS X*       |       No      |      No     |      No     |

-| Windows* OS |       No      |      No     |      No     |

------------------------------------------------------------

-

-(1) Liboffload requires _rdtsc intrinsic, which may be unsupported by some

-    versions of compiler.  In this case you need to include src/rdtsc.h

-    manually by using Makefile options options_host and options_target:

-

-    make options_host="-include src/rdtsc.h" options_target="-include src/rdtsc.h"

-

------------------------------------------------------------------------

-

-Notices

-=======

-

-*Other names and brands may be claimed as the property of others.

diff --git a/offload/doc/Reference.pdf b/offload/doc/Reference.pdf
deleted file mode 100644
index b9176f0..0000000
--- a/offload/doc/Reference.pdf
+++ /dev/null
Binary files differ
diff --git a/offload/doc/doxygen/config b/offload/doc/doxygen/config
deleted file mode 100755
index 275258f..0000000
--- a/offload/doc/doxygen/config
+++ /dev/null
@@ -1,2328 +0,0 @@
-# Doxyfile 1.8.6
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "Intel&reg;&nbsp;Offload Runtime Library"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         = 
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          = 
-
-# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
-# the documentation. The maximum height of the logo should not exceed 55 pixels
-# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
-# to the output directory.
-
-PROJECT_LOGO           = 
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = doc/doxygen/generated
-
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        = src/
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    = src/
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
-# new page for each member. If set to NO, the documentation of a member will be
-# part of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 8
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                = 
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              = 
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
-# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
-# (default is Fortran), use: inc=Fortran f=C.
-#
-# Note For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      = 
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by by putting a % sign in front of the word
-# or globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = YES
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = YES
-
-# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = YES
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. When set to YES local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO these classes will be included in the various overviews. This option has
-# no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = NO
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
-# todo list. This list is created by putting \todo commands in the
-# documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
-# test list. This list is created by putting \test commands in the
-# documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       = 
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES the list
-# will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    = 
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            = 
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. Do not use file names with spaces, bibtex cannot handle them. See
-# also \cite for info how to create references.
-
-CITE_BIB_FILES         = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO doxygen will only warn about wrong or incomplete parameter
-# documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = src
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
-
-FILE_PATTERNS          = *.c *.h *.cpp *.f90
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                = src/imported src/rdtsc.h
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       = 
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        = 
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           = 
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       = *
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             = 
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER           = 
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS        = 
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER ) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS = 
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = YES
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more acurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file that includes any scripts and style sheets
-# that doxygen needs, which is dependent on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            = 
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            = 
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        = 
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
-# defined cascading style sheet that is included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefor more robust against future updates.
-# Doxygen will copy the style sheet file to the output directory. For an example
-# see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  = 
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       = 
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               = 
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler ( hhc.exe). If non-empty
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           = 
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated (
-# YES) or that it should be included in the master .chm file ( NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     = 
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated (
-# YES) or a normal table of contents ( NO) in the .chm file.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               = 
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   = 
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  = 
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  = 
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           = 
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     = 
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       = 
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavours of web server based searching depending on the
-# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
-# searching and an index file used by the script. When EXTERNAL_SEARCH is
-# enabled the indexing and searching needs to be provided by external tools. See
-# the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       = 
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     = 
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = YES
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. To get the times font for
-# instance you can specify
-# EXTRA_PACKAGES=times
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         = 
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
-# replace them by respectively the title of the page, the current date and time,
-# only the current date, the version number of doxygen, the project name (see
-# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           = doc/doxygen/header.tex
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           = 
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      = 
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the LATEX_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    = 
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# The XML_SCHEMA tag can be used to specify a XML schema, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_SCHEMA             = 
-
-# The XML_DTD tag can be used to specify a XML DTD, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_DTD                = 
-
-# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
-# Definitions (see http://autogen.sf.net) file that captures the structure of
-# the code including all documentation. Note that this feature is still
-# experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
-# in the source code. If set to NO only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = YES
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = YES
-
-# If the SEARCH_INCLUDES tag is set to YES the includes files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           = 
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  = 
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             = COI_LIBRARY_VERSION=2 -DMYO_SUPPORT -DOFFLOAD_DEBUG=1 -DSEP_SUPPORT -DTIMING_SUPPORT
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      = 
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have an
-# all uppercase name, and do not end with a semicolon. Such function macros are
-# typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have an unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               = 
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       = 
-
-# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
-# class index. If set to NO only the inherited external classes will be listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
-# the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            = 
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               = 
-
-# If set to YES, the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font n the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           = 
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot.
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif and svg.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               = 
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           = 
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           = 
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           = 
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/offload/doc/doxygen/header.tex b/offload/doc/doxygen/header.tex
deleted file mode 100755
index 5e963c2..0000000
--- a/offload/doc/doxygen/header.tex
+++ /dev/null
@@ -1,90 +0,0 @@
-% Latex header for doxygen 1.8.3.1

-\documentclass{book}

-\usepackage[a4paper,top=2.5cm,bottom=2.5cm,left=2.5cm,right=2.5cm]{geometry}

-\usepackage{makeidx}

-\usepackage{natbib}

-\usepackage{graphicx}

-\usepackage{multicol}

-\usepackage{float}

-\usepackage{listings}

-\usepackage{color}

-\usepackage{ifthen}

-\usepackage[table]{xcolor}

-\usepackage{textcomp}

-\usepackage{alltt}

-\usepackage{ifpdf}

-\ifpdf

-\usepackage[pdftex,

-            pagebackref=true,

-            colorlinks=true,

-            linkcolor=blue,

-            unicode

-           ]{hyperref}

-\else

-\usepackage[ps2pdf,

-            pagebackref=true,

-            colorlinks=true,

-            linkcolor=blue,

-            unicode

-           ]{hyperref}

-\usepackage{pspicture}

-\fi

-\usepackage[utf8]{inputenc}

-\usepackage{mathptmx}

-\usepackage[scaled=.90]{helvet}

-\usepackage{courier}

-\usepackage{sectsty}

-\usepackage{amssymb}

-\usepackage[titles]{tocloft}

-\usepackage{doxygen}

-\usepackage{fancyhdr}

-\pagestyle{fancy}

-\lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=4,numbers=left }

-\makeindex

-\setcounter{tocdepth}{3}

-\renewcommand{\footrulewidth}{0.4pt}

-\renewcommand{\familydefault}{\sfdefault}

-\hfuzz=15pt

-\setlength{\emergencystretch}{15pt}

-\hbadness=750

-\tolerance=750

-\begin{document}

-\hypersetup{pageanchor=false,citecolor=blue}

-\begin{titlepage}

-\vspace*{7cm}

-\begin{center}

-{\Large Intel\textsuperscript{\textregistered} Offload Runtime Library }\\

-\vspace*{1cm}

-{\large Generated by Doxygen $doxygenversion }\\

-\vspace*{0.5cm}

-{\small $datetime }\\

-\end{center}

-\end{titlepage}

-

-{\bf FTC Optimization Notice}

-

-Intel's compilers may or may not optimize to the same degree for non-Intel microprocessors for

-optimizations that are not unique to Intel microprocessors. These optimizations include SSE2,

-SSE3, and SSSE3 instruction sets and other optimizations. Intel does not guarantee the

-availability, functionality, or effectiveness of any optimization on microprocessors not

-manufactured by Intel.

-

-Microprocessor-dependent optimizations in this product are intended for use with Intel

-microprocessors. Certain optimizations not specific to Intel microarchitecture are reserved for

-Intel microprocessors. Please refer to the applicable product User and Reference Guides for

-more information regarding the specific instruction sets covered by this notice.

-

-Notice revision \#20110804

-

-\vspace*{0.5cm}

-

-{\bf Trademarks}

-

-Intel, Xeon, and Intel Xeon Phi are trademarks of Intel Corporation in the U.S. and/or other countries.

-

-This document is Copyright \textcopyright 2014, Intel Corporation. All rights reserved. 

-

-\pagenumbering{roman}

-\tableofcontents

-\pagenumbering{arabic}

-\hypersetup{pageanchor=true,citecolor=blue}

diff --git a/offload/src/cean_util.cpp b/offload/src/cean_util.cpp
deleted file mode 100644
index fe1890b..0000000
--- a/offload/src/cean_util.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "cean_util.h"
-#include "offload_common.h"
-
-// 1. allocate element of CeanReadRanges type
-// 2. initialized it for reading consequently contiguous ranges
-//    described by "ap" argument
-CeanReadRanges * init_read_ranges_arr_desc(const arr_desc *ap)
-{
-    CeanReadRanges * res;
-
-    // find the max contiguous range
-    int64_t rank = ap->rank - 1;
-    int64_t length = ap->dim[rank].size;
-    for (; rank >= 0; rank--) {
-        if (ap->dim[rank].stride == 1) {
-            length *= (ap->dim[rank].upper - ap->dim[rank].lower + 1);
-            if (rank > 0 && length != ap->dim[rank - 1].size) {
-                break;
-            }
-        }
-        else {
-            break;
-        }
-    }
-
-    res =(CeanReadRanges *)malloc(sizeof(CeanReadRanges) +
-                                  (ap->rank - rank) * sizeof(CeanReadDim));
-    res->current_number = 0;
-    res->range_size = length;
-    res->last_noncont_ind = rank;
-
-    // calculate number of contiguous ranges inside noncontiguous dimensions
-    int count = 1;
-    bool prev_is_cont = true;
-    int64_t offset = 0;
-
-    for (; rank >= 0; rank--) {
-        res->Dim[rank].count = count;
-        res->Dim[rank].size = ap->dim[rank].stride * ap->dim[rank].size;
-        count *= (prev_is_cont && ap->dim[rank].stride == 1? 1 :
-            (ap->dim[rank].upper - ap->dim[rank].lower +
-            ap->dim[rank].stride) / ap->dim[rank].stride);
-        prev_is_cont = false;
-        offset +=(ap->dim[rank].lower - ap->dim[rank].lindex) *
-                 ap->dim[rank].size;
-    }
-    res->range_max_number = count;
-    res -> ptr = (void*)ap->base;
-    res -> init_offset = offset;
-    return res;
-}
-
-// check if ranges described by 1 argument could be transferred into ranges
-// described by 2-nd one
-bool cean_ranges_match(
-    CeanReadRanges * read_rng1,
-    CeanReadRanges * read_rng2
-)
-{
-    return ( read_rng1 == NULL || read_rng2 == NULL ||
-            (read_rng1->range_size % read_rng2->range_size == 0 ||
-            read_rng2->range_size % read_rng1->range_size == 0));
-}
-
-// Set next offset and length and returns true for next range.
-// Returns false if the ranges are over.
-bool get_next_range(
-    CeanReadRanges * read_rng,
-    int64_t *offset
-)
-{
-    if (++read_rng->current_number > read_rng->range_max_number) {
-        read_rng->current_number = 0;
-        return false;
-    }
-    int rank = 0;
-    int num = read_rng->current_number - 1;
-    int64_t cur_offset = 0;
-    int num_loc;
-    for (; rank <= read_rng->last_noncont_ind; rank++) {
-        num_loc = num / read_rng->Dim[rank].count;
-        cur_offset += num_loc * read_rng->Dim[rank].size;
-        num = num % read_rng->Dim[rank].count;
-    }
-    *offset = cur_offset + read_rng->init_offset;
-    return true;
-}
-
-bool is_arr_desc_contiguous(const arr_desc *ap)
-{
-    int64_t rank = ap->rank - 1;
-    int64_t length = ap->dim[rank].size;
-    for (; rank >= 0; rank--) {
-        if (ap->dim[rank].stride > 1 &&
-            ap->dim[rank].upper - ap->dim[rank].lower != 0) {
-                return false;
-        }
-        else if (length != ap->dim[rank].size) {
-            for (; rank >= 0; rank--) {
-                if (ap->dim[rank].upper - ap->dim[rank].lower != 0) {
-                    return false;
-                }
-            }
-            return true;
-        }
-        length *= (ap->dim[rank].upper - ap->dim[rank].lower + 1);
-    }
-    return true;
-}
-
-int64_t cean_get_transf_size(CeanReadRanges * read_rng)
-{
-    return(read_rng->range_max_number * read_rng->range_size);
-}
-
-static uint64_t last_left, last_right;
-typedef void (*fpp)(const char *spaces, uint64_t low, uint64_t high, int esize);
-
-static void generate_one_range(
-    const char *spaces,
-    uint64_t lrange,
-    uint64_t rrange,
-    fpp fp,
-    int esize
-)
-{
-    OFFLOAD_TRACE(3,
-        "%s    generate_one_range(lrange=%p, rrange=%p, esize=%d)\n",
-        spaces, (void*)lrange, (void*)rrange, esize);
-    if (last_left == -1) {
-        // First range
-        last_left = lrange;
-    }
-    else {
-        if (lrange == last_right+1) {
-            // Extend previous range, don't print
-        }
-        else {
-            (*fp)(spaces, last_left, last_right, esize);
-            last_left = lrange;
-        }
-    }
-    last_right = rrange;
-}
-
-static void generate_mem_ranges_one_rank(
-    const char *spaces,
-    uint64_t base,
-    uint64_t rank,
-    const struct dim_desc *ddp,
-    fpp fp,
-    int esize
-)
-{
-    uint64_t lindex = ddp->lindex;
-    uint64_t lower = ddp->lower;
-    uint64_t upper = ddp->upper;
-    uint64_t stride = ddp->stride;
-    uint64_t size = ddp->size;
-    OFFLOAD_TRACE(3,
-        "%s    "
-        "generate_mem_ranges_one_rank(base=%p, rank=%lld, lindex=%lld, "
-        "lower=%lld, upper=%lld, stride=%lld, size=%lld, esize=%d)\n",
-        spaces, (void*)base, rank, lindex, lower, upper, stride, size, esize);
-    if (rank == 1) {
-        uint64_t lrange, rrange;
-        if (stride == 1) {
-            lrange = base + (lower-lindex)*size;
-            rrange = lrange + (upper-lower+1)*size - 1;
-            generate_one_range(spaces, lrange, rrange, fp, esize);
-        }
-        else {
-            for (int i=lower-lindex; i<=upper-lindex; i+=stride) {
-                lrange = base + i*size;
-                rrange = lrange + size - 1;
-                generate_one_range(spaces, lrange, rrange, fp, esize);
-            }
-        }
-    }
-    else {
-        for (int i=lower-lindex; i<=upper-lindex; i+=stride) {
-            generate_mem_ranges_one_rank(
-                spaces, base+i*size, rank-1, ddp+1, fp, esize);
-
-        }
-    }
-}
-
-static void generate_mem_ranges(
-    const char *spaces,
-    const arr_desc *adp,
-    bool deref,
-    fpp fp
-)
-{
-    uint64_t esize;
-
-    OFFLOAD_TRACE(3,
-        "%s    "
-        "generate_mem_ranges(adp=%p, deref=%d, fp)\n",
-        spaces, adp, deref);
-    last_left = -1;
-    last_right = -2;
-
-    // Element size is derived from last dimension
-    esize = adp->dim[adp->rank-1].size;
-
-    generate_mem_ranges_one_rank(
-        // For c_cean_var the base addr is the address of the data
-        // For c_cean_var_ptr the base addr is dereferenced to get to the data
-        spaces, deref ? *((uint64_t*)(adp->base)) : adp->base,
-        adp->rank, &adp->dim[0], fp, esize);
-    (*fp)(spaces, last_left, last_right, esize);
-}
-
-// returns offset and length of the data to be transferred
-void __arr_data_offset_and_length(
-    const arr_desc *adp,
-    int64_t &offset,
-    int64_t &length
-)
-{
-    int64_t rank = adp->rank - 1;
-    int64_t size = adp->dim[rank].size;
-    int64_t r_off = 0; // offset from right boundary
-
-    // find the rightmost dimension which takes just part of its
-    // range. We define it if the size of left rank is not equal
-    // the range's length between upper and lower boungaries
-    while (rank > 0) {
-        size *= (adp->dim[rank].upper - adp->dim[rank].lower + 1);
-        if (size != adp->dim[rank - 1].size) {
-            break;
-        }
-        rank--;
-    }
-
-    offset = (adp->dim[rank].lower - adp->dim[rank].lindex) *
-             adp->dim[rank].size;
-
-    // find gaps both from the left - offset and from the right - r_off
-    for (rank--; rank >= 0; rank--) {
-        offset += (adp->dim[rank].lower - adp->dim[rank].lindex) *
-                  adp->dim[rank].size;
-        r_off += adp->dim[rank].size -
-                 (adp->dim[rank + 1].upper - adp->dim[rank + 1].lindex + 1) *
-                 adp->dim[rank + 1].size;
-    }
-    length = (adp->dim[0].upper - adp->dim[0].lindex + 1) *
-             adp->dim[0].size - offset - r_off;
-}
-
-#if OFFLOAD_DEBUG > 0
-
-void print_range(
-    const char *spaces,
-    uint64_t low,
-    uint64_t high,
-    int esize
-)
-{
-    char buffer[1024];
-    char number[32];
-
-    OFFLOAD_TRACE(3, "%s        print_range(low=%p, high=%p, esize=%d)\n",
-        spaces, (void*)low, (void*)high, esize);
-
-    if (console_enabled < 4) {
-        return;
-    }
-    OFFLOAD_TRACE(4, "%s            values:\n", spaces);
-    int count = 0;
-    buffer[0] = '\0';
-    while (low <= high)
-    {
-        switch (esize)
-        {
-        case 1:
-            sprintf(number, "%d ", *((char *)low));
-            low += 1;
-            break;
-        case 2:
-            sprintf(number, "%d ", *((short *)low));
-            low += 2;
-            break;
-        case 4:
-            sprintf(number, "%d ", *((int *)low));
-            low += 4;
-            break;
-        default:
-            sprintf(number, "0x%016x ", *((uint64_t *)low));
-            low += 8;
-            break;
-        }
-        strcat(buffer, number);
-        count++;
-        if (count == 10) {
-            OFFLOAD_TRACE(4, "%s            %s\n", spaces, buffer);
-            count = 0;
-            buffer[0] = '\0';
-        }
-    }
-    if (count != 0) {
-        OFFLOAD_TRACE(4, "%s            %s\n", spaces, buffer);
-    }
-}
-
-void __arr_desc_dump(
-    const char *spaces,
-    const char *name,
-    const arr_desc *adp,
-    bool deref
-)
-{
-    OFFLOAD_TRACE(2, "%s%s CEAN expression %p\n", spaces, name, adp);
-
-    if (adp != 0) {
-        OFFLOAD_TRACE(2, "%s    base=%llx, rank=%lld\n",
-            spaces, adp->base, adp->rank);
-
-        for (int i = 0; i < adp->rank; i++) {
-            OFFLOAD_TRACE(2,
-                          "%s    dimension %d: size=%lld, lindex=%lld, "
-                          "lower=%lld, upper=%lld, stride=%lld\n",
-                          spaces, i, adp->dim[i].size, adp->dim[i].lindex,
-                          adp->dim[i].lower, adp->dim[i].upper,
-                          adp->dim[i].stride);
-        }
-        // For c_cean_var the base addr is the address of the data
-        // For c_cean_var_ptr the base addr is dereferenced to get to the data
-        generate_mem_ranges(spaces, adp, deref, &print_range);
-    }
-}
-#endif // OFFLOAD_DEBUG
diff --git a/offload/src/cean_util.h b/offload/src/cean_util.h
deleted file mode 100644
index d0debcc..0000000
--- a/offload/src/cean_util.h
+++ /dev/null
@@ -1,101 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef CEAN_UTIL_H_INCLUDED
-#define CEAN_UTIL_H_INCLUDED
-
-#if MPSS_VERSION > 33
-#include <source/COIBuffer_source.h>
-#endif
-#include <stdint.h>
-
-#if MPSS_VERSION <= 33
-// CEAN expression representation
-struct dim_desc {
-    int64_t size;       // Length of data type
-    int64_t lindex;     // Lower index
-    int64_t lower;      // Lower section bound
-    int64_t upper;      // Upper section bound
-    int64_t stride;     // Stride
-};
-
-struct arr_desc {
-    int64_t base;       // Base address
-    int64_t rank;       // Rank of array
-    dim_desc dim[1];
-};
-#endif
-
-struct CeanReadDim {
-    int64_t count; // The number of elements in this dimension
-    int64_t size;  // The number of bytes between successive
-                   // elements in this dimension.
-};
-
-struct CeanReadRanges {
-    void *  ptr;
-    int64_t current_number;   // the number of ranges read
-    int64_t range_max_number; // number of contiguous ranges
-    int64_t range_size;       // size of max contiguous range
-    int     last_noncont_ind; // size of Dim array
-    int64_t init_offset;      // offset of 1-st element from array left bound
-    CeanReadDim Dim[1];
-};
-
-// array descriptor length
-#define __arr_desc_length(rank) \
-    (sizeof(int64_t) + sizeof(dim_desc) * (rank))
-
-// returns offset and length of the data to be transferred
-void __arr_data_offset_and_length(const arr_desc *adp,
-                                  int64_t &offset,
-                                  int64_t &length);
-
-// define if data array described by argument is contiguous one
-bool is_arr_desc_contiguous(const arr_desc *ap);
-
-// allocate element of CeanReadRanges type initialized
-// to read consequently contiguous ranges described by "ap" argument
-CeanReadRanges * init_read_ranges_arr_desc(const arr_desc *ap);
-
-// check if ranges described by 1 argument could be transferred into ranges
-// described by 2-nd one
-bool cean_ranges_match(
-    CeanReadRanges * read_rng1,
-    CeanReadRanges * read_rng2
-);
-
-// first argument - returned value by call to init_read_ranges_arr_desc.
-// returns true if offset and length of next range is set successfuly.
-// returns false if the ranges is over.
-bool get_next_range(
-    CeanReadRanges * read_rng,
-    int64_t *offset
-);
-
-// returns number of transferred bytes
-int64_t cean_get_transf_size(CeanReadRanges * read_rng);
-
-#if OFFLOAD_DEBUG > 0
-// prints array descriptor contents to stderr
-void    __arr_desc_dump(
-    const char *spaces,
-    const char *name,
-    const arr_desc *adp,
-    bool dereference);
-#else
-#define __arr_desc_dump(
-    spaces,
-    name,
-    adp,
-    dereference)
-#endif // OFFLOAD_DEBUG
-
-#endif // CEAN_UTIL_H_INCLUDED
diff --git a/offload/src/coi/coi_client.cpp b/offload/src/coi/coi_client.cpp
deleted file mode 100644
index ab8c7f5..0000000
--- a/offload/src/coi/coi_client.cpp
+++ /dev/null
@@ -1,350 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// The COI host interface
-
-#include "coi_client.h"
-#include "../offload_common.h"
-
-namespace COI {
-
-#define COI_VERSION1    "COI_1.0"
-#define COI_VERSION2    "COI_2.0"
-
-bool            is_available;
-static void*    lib_handle;
-
-// pointers to functions from COI library
-COIRESULT (*EngineGetCount)(COI_ISA_TYPE, uint32_t*);
-COIRESULT (*EngineGetHandle)(COI_ISA_TYPE, uint32_t, COIENGINE*);
-
-COIRESULT (*ProcessCreateFromMemory)(COIENGINE, const char*, const void*,
-                                     uint64_t, int, const char**, uint8_t,
-                                     const char**, uint8_t, const char*,
-                                     uint64_t, const char*, const char*,
-                                     uint64_t, COIPROCESS*);
-COIRESULT (*ProcessDestroy)(COIPROCESS, int32_t, uint8_t, int8_t*, uint32_t*);
-COIRESULT (*ProcessGetFunctionHandles)(COIPROCESS, uint32_t, const char**,
-                                       COIFUNCTION*);
-COIRESULT (*ProcessLoadLibraryFromMemory)(COIPROCESS, const void*, uint64_t,
-                                          const char*, const char*,
-                                          const char*, uint64_t, uint32_t,
-                                          COILIBRARY*);
-COIRESULT (*ProcessRegisterLibraries)(uint32_t, const void**, const uint64_t*,
-                                      const char**, const uint64_t*);
-
-COIRESULT (*PipelineCreate)(COIPROCESS, COI_CPU_MASK, uint32_t, COIPIPELINE*);
-COIRESULT (*PipelineDestroy)(COIPIPELINE);
-COIRESULT (*PipelineRunFunction)(COIPIPELINE, COIFUNCTION, uint32_t,
-                                 const COIBUFFER*, const COI_ACCESS_FLAGS*,
-                                 uint32_t, const COIEVENT*, const void*,
-                                 uint16_t, void*, uint16_t, COIEVENT*);
-
-COIRESULT (*BufferCreate)(uint64_t, COI_BUFFER_TYPE, uint32_t, const void*,
-                          uint32_t, const COIPROCESS*, COIBUFFER*);
-COIRESULT (*BufferCreateFromMemory)(uint64_t, COI_BUFFER_TYPE, uint32_t,
-                                    void*, uint32_t, const COIPROCESS*,
-                                    COIBUFFER*);
-COIRESULT (*BufferDestroy)(COIBUFFER);
-COIRESULT (*BufferMap)(COIBUFFER, uint64_t, uint64_t, COI_MAP_TYPE, uint32_t,
-                       const COIEVENT*, COIEVENT*, COIMAPINSTANCE*, void**);
-COIRESULT (*BufferUnmap)(COIMAPINSTANCE, uint32_t, const COIEVENT*, COIEVENT*);
-COIRESULT (*BufferWrite)(COIBUFFER, uint64_t, const void*, uint64_t,
-                         COI_COPY_TYPE, uint32_t, const COIEVENT*, COIEVENT*);
-COIRESULT (*BufferRead)(COIBUFFER, uint64_t, void*, uint64_t, COI_COPY_TYPE,
-                        uint32_t, const COIEVENT*, COIEVENT*);
-COIRESULT (*BufferCopy)(COIBUFFER, COIBUFFER, uint64_t, uint64_t, uint64_t,
-                        COI_COPY_TYPE, uint32_t, const COIEVENT*, COIEVENT*);
-COIRESULT (*BufferGetSinkAddress)(COIBUFFER, uint64_t*);
-COIRESULT (*BufferSetState)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
-                            COI_BUFFER_MOVE_FLAG, uint32_t,
-                            const   COIEVENT*, COIEVENT*);
-
-COIRESULT (*EventWait)(uint16_t, const COIEVENT*, int32_t, uint8_t, uint32_t*,
-                       uint32_t*);
-
-uint64_t  (*PerfGetCycleFrequency)(void);
-
-bool init(void)
-{
-#ifndef TARGET_WINNT
-    const char *lib_name = "libcoi_host.so.0";
-#else // TARGET_WINNT
-    const char *lib_name = "coi_host.dll";
-#endif // TARGET_WINNT
-
-    OFFLOAD_DEBUG_TRACE(2, "Loading COI library %s ...\n", lib_name);
-    lib_handle = DL_open(lib_name);
-    if (lib_handle == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to load the library\n");
-        return false;
-    }
-
-    EngineGetCount =
-        (COIRESULT (*)(COI_ISA_TYPE, uint32_t*))
-            DL_sym(lib_handle, "COIEngineGetCount", COI_VERSION1);
-    if (EngineGetCount == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIEngineGetCount");
-        fini();
-        return false;
-    }
-
-    EngineGetHandle =
-        (COIRESULT (*)(COI_ISA_TYPE, uint32_t, COIENGINE*))
-            DL_sym(lib_handle, "COIEngineGetHandle", COI_VERSION1);
-    if (EngineGetHandle == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIEngineGetHandle");
-        fini();
-        return false;
-    }
-
-    ProcessCreateFromMemory =
-        (COIRESULT (*)(COIENGINE, const char*, const void*, uint64_t, int,
-                       const char**, uint8_t, const char**, uint8_t,
-                       const char*, uint64_t, const char*, const char*,
-                       uint64_t, COIPROCESS*))
-            DL_sym(lib_handle, "COIProcessCreateFromMemory", COI_VERSION1);
-    if (ProcessCreateFromMemory == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIProcessCreateFromMemory");
-        fini();
-        return false;
-    }
-
-    ProcessDestroy =
-        (COIRESULT (*)(COIPROCESS, int32_t, uint8_t, int8_t*,
-                       uint32_t*))
-            DL_sym(lib_handle, "COIProcessDestroy", COI_VERSION1);
-    if (ProcessDestroy == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIProcessDestroy");
-        fini();
-        return false;
-    }
-
-    ProcessGetFunctionHandles =
-        (COIRESULT (*)(COIPROCESS, uint32_t, const char**, COIFUNCTION*))
-            DL_sym(lib_handle, "COIProcessGetFunctionHandles", COI_VERSION1);
-    if (ProcessGetFunctionHandles == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIProcessGetFunctionHandles");
-        fini();
-        return false;
-    }
-
-    ProcessLoadLibraryFromMemory =
-        (COIRESULT (*)(COIPROCESS, const void*, uint64_t, const char*,
-                       const char*, const char*, uint64_t, uint32_t,
-                       COILIBRARY*))
-            DL_sym(lib_handle, "COIProcessLoadLibraryFromMemory", COI_VERSION2);
-    if (ProcessLoadLibraryFromMemory == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIProcessLoadLibraryFromMemory");
-        fini();
-        return false;
-    }
-
-    ProcessRegisterLibraries =
-        (COIRESULT (*)(uint32_t, const void**, const uint64_t*, const char**,
-                       const uint64_t*))
-            DL_sym(lib_handle, "COIProcessRegisterLibraries", COI_VERSION1);
-    if (ProcessRegisterLibraries == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIProcessRegisterLibraries");
-        fini();
-        return false;
-    }
-
-    PipelineCreate =
-        (COIRESULT (*)(COIPROCESS, COI_CPU_MASK, uint32_t, COIPIPELINE*))
-            DL_sym(lib_handle, "COIPipelineCreate", COI_VERSION1);
-    if (PipelineCreate == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIPipelineCreate");
-        fini();
-        return false;
-    }
-
-    PipelineDestroy =
-        (COIRESULT (*)(COIPIPELINE))
-            DL_sym(lib_handle, "COIPipelineDestroy", COI_VERSION1);
-    if (PipelineDestroy == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIPipelineDestroy");
-        fini();
-        return false;
-    }
-
-    PipelineRunFunction =
-        (COIRESULT (*)(COIPIPELINE, COIFUNCTION, uint32_t, const COIBUFFER*,
-                       const COI_ACCESS_FLAGS*, uint32_t, const COIEVENT*,
-                       const void*, uint16_t, void*, uint16_t, COIEVENT*))
-            DL_sym(lib_handle, "COIPipelineRunFunction", COI_VERSION1);
-    if (PipelineRunFunction == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIPipelineRunFunction");
-        fini();
-        return false;
-    }
-
-    BufferCreate =
-        (COIRESULT (*)(uint64_t, COI_BUFFER_TYPE, uint32_t, const void*,
-                       uint32_t, const COIPROCESS*, COIBUFFER*))
-            DL_sym(lib_handle, "COIBufferCreate", COI_VERSION1);
-    if (BufferCreate == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferCreate");
-        fini();
-        return false;
-    }
-
-    BufferCreateFromMemory =
-        (COIRESULT (*)(uint64_t, COI_BUFFER_TYPE, uint32_t, void*,
-                       uint32_t, const COIPROCESS*, COIBUFFER*))
-            DL_sym(lib_handle, "COIBufferCreateFromMemory", COI_VERSION1);
-    if (BufferCreateFromMemory == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferCreateFromMemory");
-        fini();
-        return false;
-    }
-
-    BufferDestroy =
-        (COIRESULT (*)(COIBUFFER))
-            DL_sym(lib_handle, "COIBufferDestroy", COI_VERSION1);
-    if (BufferDestroy == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferDestroy");
-        fini();
-        return false;
-    }
-
-    BufferMap =
-        (COIRESULT (*)(COIBUFFER, uint64_t, uint64_t, COI_MAP_TYPE, uint32_t,
-                       const COIEVENT*, COIEVENT*, COIMAPINSTANCE*,
-                       void**))
-            DL_sym(lib_handle, "COIBufferMap", COI_VERSION1);
-    if (BufferMap == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferMap");
-        fini();
-        return false;
-    }
-
-    BufferUnmap =
-        (COIRESULT (*)(COIMAPINSTANCE, uint32_t, const COIEVENT*,
-                       COIEVENT*))
-            DL_sym(lib_handle, "COIBufferUnmap", COI_VERSION1);
-    if (BufferUnmap == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferUnmap");
-        fini();
-        return false;
-    }
-
-    BufferWrite =
-        (COIRESULT (*)(COIBUFFER, uint64_t, const void*, uint64_t,
-                       COI_COPY_TYPE, uint32_t, const COIEVENT*,
-                       COIEVENT*))
-            DL_sym(lib_handle, "COIBufferWrite", COI_VERSION1);
-    if (BufferWrite == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferWrite");
-        fini();
-        return false;
-    }
-
-    BufferRead =
-        (COIRESULT (*)(COIBUFFER, uint64_t, void*, uint64_t,
-                                     COI_COPY_TYPE, uint32_t,
-                                     const COIEVENT*, COIEVENT*))
-            DL_sym(lib_handle, "COIBufferRead", COI_VERSION1);
-    if (BufferRead == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferRead");
-        fini();
-        return false;
-    }
-
-    BufferCopy =
-        (COIRESULT (*)(COIBUFFER, COIBUFFER, uint64_t, uint64_t, uint64_t,
-                       COI_COPY_TYPE, uint32_t, const COIEVENT*,
-                       COIEVENT*))
-            DL_sym(lib_handle, "COIBufferCopy", COI_VERSION1);
-    if (BufferCopy == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferCopy");
-        fini();
-        return false;
-    }
-
-    BufferGetSinkAddress =
-        (COIRESULT (*)(COIBUFFER, uint64_t*))
-            DL_sym(lib_handle, "COIBufferGetSinkAddress", COI_VERSION1);
-    if (BufferGetSinkAddress == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferGetSinkAddress");
-        fini();
-        return false;
-    }
-
-    BufferSetState =
-        (COIRESULT(*)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
-                      COI_BUFFER_MOVE_FLAG, uint32_t, const COIEVENT*,
-                      COIEVENT*))
-            DL_sym(lib_handle, "COIBufferSetState", COI_VERSION1);
-    if (BufferSetState == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIBufferSetState");
-        fini();
-        return false;
-    }
-
-    EventWait =
-        (COIRESULT (*)(uint16_t, const COIEVENT*, int32_t, uint8_t,
-                       uint32_t*, uint32_t*))
-            DL_sym(lib_handle, "COIEventWait", COI_VERSION1);
-    if (EventWait == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIEventWait");
-        fini();
-        return false;
-    }
-
-    PerfGetCycleFrequency =
-        (uint64_t (*)(void))
-            DL_sym(lib_handle, "COIPerfGetCycleFrequency", COI_VERSION1);
-    if (PerfGetCycleFrequency == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
-                            "COIPerfGetCycleFrequency");
-        fini();
-        return false;
-    }
-
-    is_available = true;
-
-    return true;
-}
-
-void fini(void)
-{
-    is_available = false;
-
-    if (lib_handle != 0) {
-#ifndef TARGET_WINNT
-        DL_close(lib_handle);
-#endif // TARGET_WINNT
-        lib_handle = 0;
-    }
-}
-
-} // namespace COI
diff --git a/offload/src/coi/coi_client.h b/offload/src/coi/coi_client.h
deleted file mode 100644
index 4775a8b..0000000
--- a/offload/src/coi/coi_client.h
+++ /dev/null
@@ -1,118 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// The interface between offload library and the COI API on the host
-
-#ifndef COI_CLIENT_H_INCLUDED
-#define COI_CLIENT_H_INCLUDED
-
-#include <common/COIPerf_common.h>
-#include <source/COIEngine_source.h>
-#include <source/COIProcess_source.h>
-#include <source/COIPipeline_source.h>
-#include <source/COIBuffer_source.h>
-#include <source/COIEvent_source.h>
-
-#include <string.h>
-
-#include "../liboffload_error_codes.h"
-#include "../offload_util.h"
-
-#define MIC_ENGINES_MAX     128
-
-#if MIC_ENGINES_MAX < COI_MAX_ISA_MIC_DEVICES
-#error MIC_ENGINES_MAX need to be increased
-#endif
-
-// COI library interface
-namespace COI {
-
-extern bool init(void);
-extern void fini(void);
-
-extern bool is_available;
-
-// pointers to functions from COI library
-extern COIRESULT (*EngineGetCount)(COI_ISA_TYPE, uint32_t*);
-extern COIRESULT (*EngineGetHandle)(COI_ISA_TYPE, uint32_t, COIENGINE*);
-
-extern COIRESULT (*ProcessCreateFromMemory)(COIENGINE, const char*,
-                                           const void*, uint64_t, int,
-                                           const char**, uint8_t,
-                                           const char**, uint8_t,
-                                           const char*, uint64_t,
-                                           const char*,
-                                           const char*, uint64_t,
-                                           COIPROCESS*);
-extern COIRESULT (*ProcessDestroy)(COIPROCESS, int32_t, uint8_t,
-                                  int8_t*, uint32_t*);
-extern COIRESULT (*ProcessGetFunctionHandles)(COIPROCESS, uint32_t,
-                                             const char**,
-                                             COIFUNCTION*);
-extern COIRESULT (*ProcessLoadLibraryFromMemory)(COIPROCESS,
-                                                const void*,
-                                                uint64_t,
-                                                const char*,
-                                                const char*,
-                                                const char*,
-                                                uint64_t,
-                                                uint32_t,
-                                                COILIBRARY*);
-extern COIRESULT (*ProcessRegisterLibraries)(uint32_t,
-                                            const void**,
-                                            const uint64_t*,
-                                            const char**,
-                                            const uint64_t*);
-
-extern COIRESULT (*PipelineCreate)(COIPROCESS, COI_CPU_MASK, uint32_t,
-                                  COIPIPELINE*);
-extern COIRESULT (*PipelineDestroy)(COIPIPELINE);
-extern COIRESULT (*PipelineRunFunction)(COIPIPELINE, COIFUNCTION,
-                                       uint32_t, const COIBUFFER*,
-                                       const COI_ACCESS_FLAGS*,
-                                       uint32_t, const COIEVENT*,
-                                       const void*, uint16_t, void*,
-                                       uint16_t, COIEVENT*);
-
-extern COIRESULT (*BufferCreate)(uint64_t, COI_BUFFER_TYPE, uint32_t,
-                                const void*, uint32_t,
-                                const COIPROCESS*, COIBUFFER*);
-extern COIRESULT (*BufferCreateFromMemory)(uint64_t, COI_BUFFER_TYPE,
-                                          uint32_t, void*,
-                                          uint32_t, const COIPROCESS*,
-                                          COIBUFFER*);
-extern COIRESULT (*BufferDestroy)(COIBUFFER);
-extern COIRESULT (*BufferMap)(COIBUFFER, uint64_t, uint64_t,
-                             COI_MAP_TYPE, uint32_t, const COIEVENT*,
-                             COIEVENT*, COIMAPINSTANCE*, void**);
-extern COIRESULT (*BufferUnmap)(COIMAPINSTANCE, uint32_t,
-                               const COIEVENT*, COIEVENT*);
-extern COIRESULT (*BufferWrite)(COIBUFFER, uint64_t, const void*,
-                               uint64_t, COI_COPY_TYPE, uint32_t,
-                               const COIEVENT*, COIEVENT*);
-extern COIRESULT (*BufferRead)(COIBUFFER, uint64_t, void*, uint64_t,
-                              COI_COPY_TYPE, uint32_t,
-                              const COIEVENT*, COIEVENT*);
-extern COIRESULT (*BufferCopy)(COIBUFFER, COIBUFFER, uint64_t, uint64_t,
-                              uint64_t, COI_COPY_TYPE, uint32_t,
-                              const COIEVENT*, COIEVENT*);
-extern COIRESULT (*BufferGetSinkAddress)(COIBUFFER, uint64_t*);
-extern COIRESULT (*BufferSetState)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
-                                   COI_BUFFER_MOVE_FLAG, uint32_t,
-                                   const   COIEVENT*, COIEVENT*);
-
-extern COIRESULT (*EventWait)(uint16_t, const COIEVENT*, int32_t,
-                           uint8_t, uint32_t*, uint32_t*);
-
-extern uint64_t  (*PerfGetCycleFrequency)(void);
-
-} // namespace COI
-
-#endif // COI_CLIENT_H_INCLUDED
diff --git a/offload/src/coi/coi_server.cpp b/offload/src/coi/coi_server.cpp
deleted file mode 100644
index 73e6c2d..0000000
--- a/offload/src/coi/coi_server.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// The COI interface on the target
-
-#include "coi_server.h"
-
-#include "../offload_target.h"
-#include "../offload_timer.h"
-#ifdef MYO_SUPPORT
-#include "../offload_myo_target.h"      // for __offload_myoLibInit/Fini
-#endif // MYO_SUPPORT
-
-COINATIVELIBEXPORT
-void server_compute(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    OffloadDescriptor::offload(buffer_count, buffers,
-                               misc_data, misc_data_len,
-                               return_data, return_data_len);
-}
-
-COINATIVELIBEXPORT
-void server_init(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    struct init_data {
-        int  device_index;
-        int  devices_total;
-        int  console_level;
-        int  offload_report_level;
-    } *data = (struct init_data*) misc_data;
-
-    // set device index and number of total devices
-    mic_index = data->device_index;
-    mic_engines_total = data->devices_total;
-
-    // initialize trace level
-    console_enabled = data->console_level;
-    offload_report_level = data->offload_report_level;
-
-    // return back the process id
-    *((pid_t*) return_data) = getpid();
-}
-
-COINATIVELIBEXPORT
-void server_var_table_size(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    struct Params {
-        int64_t nelems;
-        int64_t length;
-    } *params;
-
-    params = static_cast<Params*>(return_data);
-    params->length = __offload_vars.table_size(params->nelems);
-}
-
-COINATIVELIBEXPORT
-void server_var_table_copy(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    __offload_vars.table_copy(buffers[0], *static_cast<int64_t*>(misc_data));
-}
-
-#ifdef MYO_SUPPORT
-// temporary workaround for blocking behavior of myoiLibInit/Fini calls
-COINATIVELIBEXPORT
-void server_myoinit(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    __offload_myoLibInit();
-}
-
-COINATIVELIBEXPORT
-void server_myofini(
-    uint32_t  buffer_count,
-    void**    buffers,
-    uint64_t* buffers_len,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    __offload_myoLibFini();
-}
-#endif // MYO_SUPPORT
diff --git a/offload/src/coi/coi_server.h b/offload/src/coi/coi_server.h
deleted file mode 100644
index e744d9e..0000000
--- a/offload/src/coi/coi_server.h
+++ /dev/null
@@ -1,74 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-//The interface between offload library and the COI API on the target.
-
-#ifndef COI_SERVER_H_INCLUDED
-#define COI_SERVER_H_INCLUDED
-
-#include <common/COIEngine_common.h>
-#include <common/COIPerf_common.h>
-#include <sink/COIProcess_sink.h>
-#include <sink/COIPipeline_sink.h>
-#include <sink/COIBuffer_sink.h>
-#include <list>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include "../liboffload_error_codes.h"
-
-// wrappers for COI API
-#define PipelineStartExecutingRunFunctions() \
-    { \
-        COIRESULT res = COIPipelineStartExecutingRunFunctions(); \
-        if (res != COI_SUCCESS) { \
-            LIBOFFLOAD_ERROR(c_pipeline_start_run_funcs, mic_index, res); \
-            exit(1); \
-        } \
-    }
-
-#define ProcessWaitForShutdown() \
-    { \
-        COIRESULT res = COIProcessWaitForShutdown(); \
-        if (res != COI_SUCCESS) { \
-            LIBOFFLOAD_ERROR(c_process_wait_shutdown, mic_index, res); \
-            exit(1); \
-        } \
-    }
-
-#define BufferAddRef(buf) \
-    { \
-        COIRESULT res = COIBufferAddRef(buf); \
-        if (res != COI_SUCCESS) { \
-            LIBOFFLOAD_ERROR(c_buf_add_ref, mic_index, res); \
-            exit(1); \
-        } \
-    }
-
-#define BufferReleaseRef(buf) \
-    { \
-        COIRESULT res = COIBufferReleaseRef(buf); \
-        if (res != COI_SUCCESS) { \
-            LIBOFFLOAD_ERROR(c_buf_release_ref, mic_index, res); \
-            exit(1); \
-        } \
-    }
-
-#define EngineGetIndex(index) \
-    { \
-        COI_ISA_TYPE isa_type; \
-        COIRESULT res = COIEngineGetIndex(&isa_type, index); \
-        if (res != COI_SUCCESS) { \
-            LIBOFFLOAD_ERROR(c_get_engine_index, mic_index, res); \
-            exit(1); \
-        } \
-    }
-
-#endif // COI_SERVER_H_INCLUDED
diff --git a/offload/src/compiler_if_host.cpp b/offload/src/compiler_if_host.cpp
deleted file mode 100644
index 2bc430b..0000000
--- a/offload/src/compiler_if_host.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "compiler_if_host.h"
-
-#include <malloc.h>
-#ifndef TARGET_WINNT
-#include <alloca.h>
-#endif // TARGET_WINNT
-
-// Global counter on host. 
-// This variable is used if P2OPT_offload_do_data_persistence == 2.
-// The variable used to identify offload constructs contained in one procedure.
-// Increment of OFFLOAD_CALL_COUNT is inserted at entries of HOST routines with
-// offload constructs.
-static int offload_call_count = 0;
-
-extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE(
-    TARGET_TYPE      target_type,
-    int              target_number,
-    int              is_optional,
-    _Offload_status* status,
-    const char*      file,
-    uint64_t         line
-)
-{
-    bool retval;
-    OFFLOAD ofld;
-
-    // initialize status
-    if (status != 0) {
-        status->result = OFFLOAD_UNAVAILABLE;
-        status->device_number = -1;
-        status->data_sent = 0;
-        status->data_received = 0;
-    }
-
-    // make sure libray is initialized
-    retval = __offload_init_library();
-
-    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
-    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
-
-    // initialize all devices is init_type is on_offload_all
-    if (retval && __offload_init_type == c_init_on_offload_all) {
-        for (int i = 0; i < mic_engines_total; i++) {
-             mic_engines[i].init();
-        }
-    }
-    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);
-
-    if (target_type == TARGET_HOST) {
-        // Host always available
-        retval = true;
-    }
-    else if (target_type == TARGET_MIC) {
-        if (target_number >= -1) {
-            if (retval) {
-                if (target_number >= 0) {
-                    // User provided the device number
-                    target_number = target_number % mic_engines_total;
-                }
-                else {
-                    // use device 0
-                    target_number = 0;
-                }
-
-                // reserve device in ORSL
-                if (is_optional) {
-                    if (!ORSL::try_reserve(target_number)) {
-                        target_number = -1;
-                    }
-                }
-                else {
-                    if (!ORSL::reserve(target_number)) {
-                        target_number = -1;
-                    }
-                }
-
-                // initialize device
-                if (target_number >= 0 &&
-                    __offload_init_type == c_init_on_offload) {
-                    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
-                    mic_engines[target_number].init();
-                    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
-                }
-            }
-            else {
-                // fallback to CPU
-                target_number = -1;
-            }
-
-            if (target_number < 0 || !retval) {
-                if (!is_optional && status == 0) {
-                    LIBOFFLOAD_ERROR(c_device_is_not_available);
-                    exit(1);
-                }
-
-                retval = false;
-            }
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_device_number);
-            exit(1);
-        }
-    }
-
-    if (retval) {
-        ofld = new OffloadDescriptor(target_number, status,
-                                     !is_optional, false, timer_data);
-        OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);
-        Offload_Report_Prolog(timer_data);
-        OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
-                              "Starting offload: target_type = %d, "
-                              "number = %d, is_optional = %d\n",
-                              target_type, target_number, is_optional);
-
-        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
-    }
-    else {
-        ofld = NULL;
-
-        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
-        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_total_offload);
-        offload_report_free_data(timer_data);
-    }
-
-    return ofld;
-}
-
-extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE1(
-    const int*  device_num,
-    const char* file,
-    uint64_t    line
-)
-{
-    int target_number;
-
-    // make sure libray is initialized and at least one device is available
-    if (!__offload_init_library()) {
-        LIBOFFLOAD_ERROR(c_device_is_not_available);
-        exit(1);
-    }
-
-    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
-
-    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
-
-    if (__offload_init_type == c_init_on_offload_all) {
-        for (int i = 0; i < mic_engines_total; i++) {
-             mic_engines[i].init();
-        }
-    }
-
-    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
-
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);
-
-    // use default device number if it is not provided
-    if (device_num != 0) {
-        target_number = *device_num;
-    }
-    else {
-        target_number = __omp_device_num;
-    }
-
-    // device number should be a non-negative integer value
-    if (target_number < 0) {
-        LIBOFFLOAD_ERROR(c_omp_invalid_device_num);
-        exit(1);
-    }
-
-    // should we do this for OpenMP?
-    target_number %= mic_engines_total;
-
-    // reserve device in ORSL
-    if (!ORSL::reserve(target_number)) {
-        LIBOFFLOAD_ERROR(c_device_is_not_available);
-        exit(1);
-    }
-
-    // initialize device(s)
-    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
-
-    if (__offload_init_type == c_init_on_offload) {
-        mic_engines[target_number].init();
-    }
-
-    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
-
-    OFFLOAD ofld =
-        new OffloadDescriptor(target_number, 0, true, true, timer_data);
-
-    OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);
-
-    Offload_Report_Prolog(timer_data);
-
-    OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
-                          "Starting OpenMP offload, device = %d\n",
-                          target_number);
-
-    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
-
-    return ofld;
-}
-
-int offload_offload_wrap(
-    OFFLOAD ofld,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void **waits,
-    const void **signal,
-    int entry_id,
-    const void *stack_addr
-)
-{
-    bool ret = ofld->offload(name, is_empty, vars, vars2, num_vars,
-                             waits, num_waits, signal, entry_id, stack_addr);
-    if (!ret || signal == 0) {
-        delete ofld;
-    }
-    return ret;
-}
-
-extern "C" int OFFLOAD_OFFLOAD1(
-    OFFLOAD ofld,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void **waits,
-    const void **signal
-)
-{
-    return offload_offload_wrap(ofld, name, is_empty,
-                            num_vars, vars, vars2,
-                            num_waits, waits,
-                            signal, NULL, NULL);
-}
-
-extern "C" int OFFLOAD_OFFLOAD2(
-    OFFLOAD ofld,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void** waits,
-    const void** signal,
-    int entry_id,
-    const void *stack_addr
-)
-{
-    return offload_offload_wrap(ofld, name, is_empty,
-                            num_vars, vars, vars2,
-                            num_waits, waits,
-                            signal, entry_id, stack_addr);
-}
-
-extern "C" int OFFLOAD_OFFLOAD(
-    OFFLOAD ofld,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void **waits,
-    const void *signal,
-    int entry_id,
-    const void *stack_addr
-)
-{
-    // signal is passed by reference now
-    const void **signal_new = (signal != 0) ? &signal : 0;
-    const void **waits_new = 0;
-    int num_waits_new = 0;
-
-    // remove NULL values from the list of signals to wait for
-    if (num_waits > 0) {
-        waits_new = (const void**) alloca(sizeof(void*) * num_waits);
-        for (int i = 0; i < num_waits; i++) {
-            if (waits[i] != 0) {
-                waits_new[num_waits_new++] = waits[i];
-            }
-        }
-    }
-
-    return OFFLOAD_OFFLOAD1(ofld, name, is_empty,
-                            num_vars, vars, vars2,
-                            num_waits_new, waits_new,
-                            signal_new);
-}
-
-extern "C" int OFFLOAD_CALL_COUNT()
-{
-    offload_call_count++;
-    return offload_call_count;
-}
diff --git a/offload/src/compiler_if_host.h b/offload/src/compiler_if_host.h
deleted file mode 100644
index 4b34c51..0000000
--- a/offload/src/compiler_if_host.h
+++ /dev/null
@@ -1,133 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*! \file
-    \brief The interface between compiler-generated host code and runtime library
-*/
-
-#ifndef COMPILER_IF_HOST_H_INCLUDED
-#define COMPILER_IF_HOST_H_INCLUDED
-
-#include "offload_host.h"
-
-#define OFFLOAD_TARGET_ACQUIRE          OFFLOAD_PREFIX(target_acquire)
-#define OFFLOAD_TARGET_ACQUIRE1         OFFLOAD_PREFIX(target_acquire1)
-#define OFFLOAD_OFFLOAD                 OFFLOAD_PREFIX(offload)
-#define OFFLOAD_OFFLOAD1                OFFLOAD_PREFIX(offload1)
-#define OFFLOAD_OFFLOAD2                OFFLOAD_PREFIX(offload2)
-#define OFFLOAD_CALL_COUNT              OFFLOAD_PREFIX(offload_call_count)
-
-
-/*! \fn OFFLOAD_TARGET_ACQUIRE
-    \brief Attempt to acquire the target.
-    \param target_type   The type of target.
-    \param target_number The device number.
-    \param is_optional   Whether CPU fall-back is allowed.
-    \param status        Address of variable to hold offload status.
-    \param file          Filename in which this offload occurred.
-    \param line          Line number in the file where this offload occurred.
-*/
-extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE(
-    TARGET_TYPE      target_type,
-    int              target_number,
-    int              is_optional,
-    _Offload_status* status,
-    const char*      file,
-    uint64_t         line
-);
-
-/*! \fn OFFLOAD_TARGET_ACQUIRE1
-    \brief Acquire the target for offload (OpenMP).
-    \param device_number Device number or null if not specified.
-    \param file          Filename in which this offload occurred
-    \param line          Line number in the file where this offload occurred.
-*/
-extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE1(
-    const int*      device_number,
-    const char*     file,
-    uint64_t        line
-);
-
-/*! \fn OFFLOAD_OFFLOAD1
-    \brief Run function on target using interface for old data persistence.
-    \param o Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
-    \param name Name of offload entry point.
-    \param is_empty If no code to execute (e.g. offload_transfer)
-    \param num_vars Number of variable descriptors.
-    \param vars Pointer to VarDesc array.
-    \param vars2 Pointer to VarDesc2 array.
-    \param num_waits Number of "wait" values.
-    \param waits Pointer to array of wait values.
-    \param signal Pointer to signal value or NULL.
-*/
-extern "C" int OFFLOAD_OFFLOAD1(
-    OFFLOAD o,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void** waits,
-    const void** signal
-);
-
-/*! \fn OFFLOAD_OFFLOAD2
-    \brief Run function on target using interface for new data persistence.
-    \param o Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
-    \param name Name of offload entry point.
-    \param is_empty If no code to execute (e.g. offload_transfer)
-    \param num_vars Number of variable descriptors.
-    \param vars Pointer to VarDesc array.
-    \param vars2 Pointer to VarDesc2 array.
-    \param num_waits Number of "wait" values.
-    \param waits Pointer to array of wait values.
-    \param signal Pointer to signal value or NULL.
-    \param entry_id A signature for the function doing the offload.
-    \param stack_addr The stack frame address of the function doing offload.
-*/
-extern "C" int OFFLOAD_OFFLOAD2(
-    OFFLOAD o,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void** waits,
-    const void** signal,
-    int entry_id,
-    const void *stack_addr
-);
-
-// Run function on target (obsolete).
-// @param o    OFFLOAD object
-// @param name function name
-extern "C" int OFFLOAD_OFFLOAD(
-    OFFLOAD o,
-    const char *name,
-    int is_empty,
-    int num_vars,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int num_waits,
-    const void** waits,
-    const void* signal,
-    int entry_id = 0,
-    const void *stack_addr = NULL
-);
-
-// Global counter on host.
-// This variable is used if P2OPT_offload_do_data_persistence == 2.
-// The variable used to identify offload constructs contained in one procedure.
-// Call to OFFLOAD_CALL_COUNT() is inserted at HOST on entry of the routine.
-extern "C" int  OFFLOAD_CALL_COUNT();
-
-#endif // COMPILER_IF_HOST_H_INCLUDED
diff --git a/offload/src/compiler_if_target.cpp b/offload/src/compiler_if_target.cpp
deleted file mode 100644
index 1af82b8..0000000
--- a/offload/src/compiler_if_target.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "compiler_if_target.h"
-
-extern "C" void OFFLOAD_TARGET_ENTER(
-    OFFLOAD ofld,
-    int vars_total,
-    VarDesc *vars,
-    VarDesc2 *vars2
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p, %d, %p, %p)\n", __func__, ofld,
-                        vars_total, vars, vars2);
-    ofld->merge_var_descs(vars, vars2, vars_total);
-    ofld->scatter_copyin_data();
-}
-
-extern "C" void OFFLOAD_TARGET_LEAVE(
-    OFFLOAD ofld
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ofld);
-    ofld->gather_copyout_data();
-}
-
-extern "C" void OFFLOAD_TARGET_MAIN(void)
-{
-    // initialize target part
-    __offload_target_init();
-
-    // pass control to COI
-    PipelineStartExecutingRunFunctions();
-    ProcessWaitForShutdown();
-
-    OFFLOAD_DEBUG_TRACE(2, "Exiting main...\n");
-}
diff --git a/offload/src/compiler_if_target.h b/offload/src/compiler_if_target.h
deleted file mode 100644
index 49d2c1c..0000000
--- a/offload/src/compiler_if_target.h
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*! \file
-    \brief The interface between compiler-generated target code and runtime library
-*/
-
-#ifndef COMPILER_IF_TARGET_H_INCLUDED
-#define COMPILER_IF_TARGET_H_INCLUDED
-
-#include "offload_target.h"
-
-#define OFFLOAD_TARGET_ENTER            OFFLOAD_PREFIX(target_enter)
-#define OFFLOAD_TARGET_LEAVE            OFFLOAD_PREFIX(target_leave)
-#define OFFLOAD_TARGET_MAIN             OFFLOAD_PREFIX(target_main)
-
-/*! \fn OFFLOAD_TARGET_ENTER
-    \brief Fill in variable addresses using VarDesc array.
-    \brief Then call back the runtime library to fetch data.
-    \param ofld         Offload descriptor created by runtime.
-    \param var_desc_num Number of variable descriptors.
-    \param var_desc     Pointer to VarDesc array.
-    \param var_desc2    Pointer to VarDesc2 array.
-*/
-extern "C" void OFFLOAD_TARGET_ENTER(
-    OFFLOAD ofld,
-    int var_desc_num,
-    VarDesc *var_desc,
-    VarDesc2 *var_desc2
-);
-
-/*! \fn OFFLOAD_TARGET_LEAVE
-    \brief Call back the runtime library to gather outputs using VarDesc array.
-    \param ofld Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
-*/
-extern "C" void OFFLOAD_TARGET_LEAVE(
-    OFFLOAD ofld
-);
-
-// Entry point for the target application.
-extern "C" void OFFLOAD_TARGET_MAIN(void);
-
-#endif // COMPILER_IF_TARGET_H_INCLUDED
diff --git a/offload/src/dv_util.cpp b/offload/src/dv_util.cpp
deleted file mode 100644
index 4ad7271..0000000
--- a/offload/src/dv_util.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_common.h"
-
-bool __dv_is_contiguous(const ArrDesc *dvp)
-{
-    if (dvp->Flags & ArrDescFlagsContiguous) {
-        return true;
-    }
-
-    if (dvp->Rank != 0) {
-        if (dvp->Dim[0].Mult != dvp->Len) {
-            return false;
-        }
-        for (int i = 1; i < dvp->Rank; i++) {
-            if (dvp->Dim[i].Mult !=
-                dvp->Dim[i-1].Extent * dvp->Dim[i-1].Mult) {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-bool __dv_is_allocated(const ArrDesc *dvp)
-{
-    return (dvp->Flags & ArrDescFlagsDefined);
-}
-
-uint64_t __dv_data_length(const ArrDesc *dvp)
-{
-    uint64_t size;
-
-    if (dvp->Rank == 0) {
-        size = dvp->Len;
-        return size;
-    }
-
-    size = dvp->Len;
-    for (int i = 0; i < dvp->Rank; ++i) {
-        size += (dvp->Dim[i].Extent-1) * dvp->Dim[i].Mult;
-    }
-    return size;
-}
-
-uint64_t __dv_data_length(const ArrDesc *dvp, int64_t count)
-{
-    if (dvp->Rank == 0) {
-        return count;
-    }
-
-    return count * dvp->Dim[0].Mult;
-}
-
-// Create CeanReadRanges data for reading contiguous ranges of
-// noncontiguous array defined by the argument
-CeanReadRanges * init_read_ranges_dv(const ArrDesc *dvp)
-{
-    int64_t         len;
-    int             count;
-    int             rank = dvp->Rank;
-    CeanReadRanges *res = NULL;
-
-    if (rank != 0) {
-        int i = 0;
-        len = dvp->Len;
-        if (dvp->Dim[0].Mult == len) {
-            for (i = 1; i < rank; i++) {
-                len *= dvp->Dim[i-1].Extent;
-                if (dvp->Dim[i].Mult != len) {
-                    break;
-                }
-            }
-        }
-        res = (CeanReadRanges *)malloc(
-            sizeof(CeanReadRanges) + (rank - i) * sizeof(CeanReadDim));
-        res -> last_noncont_ind = rank - i - 1;
-        count = 1;
-        for (; i < rank; i++) {
-            res->Dim[rank - i - 1].count = count;
-            res->Dim[rank - i - 1].size = dvp->Dim[i].Mult;
-            count *= dvp->Dim[i].Extent;
-        }
-        res -> range_max_number = count;
-        res -> range_size = len;
-        res -> ptr = (void*)dvp->Base;
-        res -> current_number = 0;
-        res -> init_offset = 0;
-    }
-    return res;
-}
-
-#if OFFLOAD_DEBUG > 0
-void __dv_desc_dump(const char *name, const ArrDesc *dvp)
-{
-    OFFLOAD_TRACE(3, "%s DV %p\n", name, dvp);
-
-    if (dvp != 0) {
-        OFFLOAD_TRACE(3,
-                      "    dv->Base   = 0x%lx\n"
-                      "    dv->Len    = 0x%lx\n"
-                      "    dv->Offset = 0x%lx\n"
-                      "    dv->Flags  = 0x%lx\n"
-                      "    dv->Rank   = 0x%lx\n"
-                      "    dv->Resrvd = 0x%lx\n",
-                      dvp->Base,
-                      dvp->Len,
-                      dvp->Offset,
-                      dvp->Flags,
-                      dvp->Rank,
-                      dvp->Reserved);
-
-        for (int i = 0 ; i < dvp->Rank; i++) {
-            OFFLOAD_TRACE(3,
-                          "    (%d) Extent=%ld, Multiplier=%ld, LowerBound=%ld\n",
-                          i,
-                          dvp->Dim[i].Extent,
-                          dvp->Dim[i].Mult,
-                          dvp->Dim[i].LowerBound);
-        }
-    }
-}
-#endif // OFFLOAD_DEBUG > 0
diff --git a/offload/src/dv_util.h b/offload/src/dv_util.h
deleted file mode 100644
index fdfa77d..0000000
--- a/offload/src/dv_util.h
+++ /dev/null
@@ -1,63 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef DV_UTIL_H_INCLUDED
-#define DV_UTIL_H_INCLUDED
-
-#include <stdint.h>
-
-// Dope vector declarations
-#define ArrDescMaxArrayRank         31
-
-// Dope vector flags
-#define ArrDescFlagsDefined         1
-#define ArrDescFlagsNodealloc       2
-#define ArrDescFlagsContiguous      4
-
-typedef int64_t dv_size;
-
-typedef struct DimDesc {
-    dv_size        Extent;      // Number of elements in this dimension
-    dv_size        Mult;        // Multiplier for this dimension.
-                                // The number of bytes between successive
-                                // elements in this dimension.
-    dv_size        LowerBound;  // LowerBound of this dimension
-} DimDesc ;
-
-typedef struct ArrDesc {
-    dv_size        Base;        // Base address
-    dv_size        Len;         // Length of data type, used only for
-                                // character strings.
-    dv_size        Offset;
-    dv_size        Flags;       // Flags
-    dv_size        Rank;        // Rank of pointer
-    dv_size        Reserved;    // reserved for openmp requests
-    DimDesc Dim[ArrDescMaxArrayRank];
-} ArrDesc ;
-
-typedef ArrDesc* pArrDesc;
-
-bool __dv_is_contiguous(const ArrDesc *dvp);
-
-bool __dv_is_allocated(const ArrDesc *dvp);
-
-uint64_t __dv_data_length(const ArrDesc *dvp);
-
-uint64_t __dv_data_length(const ArrDesc *dvp, int64_t nelems);
-
-CeanReadRanges * init_read_ranges_dv(const ArrDesc *dvp);
-
-#if OFFLOAD_DEBUG > 0
-void    __dv_desc_dump(const char *name, const ArrDesc *dvp);
-#else // OFFLOAD_DEBUG
-#define __dv_desc_dump(name, dvp)
-#endif // OFFLOAD_DEBUG
-
-#endif // DV_UTIL_H_INCLUDED
diff --git a/offload/src/liboffload_error.c b/offload/src/liboffload_error.c
deleted file mode 100644
index fc15f8b..0000000
--- a/offload/src/liboffload_error.c
+++ /dev/null
@@ -1,452 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include <stdio.h>
-#include <stdarg.h>
-#ifndef va_copy
-#define va_copy(dst, src) ((dst) = (src))
-#endif
-
-#include "liboffload_msg.h"
-
-#include "liboffload_error_codes.h"
-
-/***********************************************/
-/* error-handling function, liboffload_error_support */
-/***********************************************/
-
-void __liboffload_error_support(error_types input_tag, ...)
-{
-    va_list args;
-    va_start(args, input_tag);
-
-    switch (input_tag) {
-        case c_device_is_not_available:
-            write_message(stderr, msg_c_device_is_not_available, args);
-            break;
-        case c_invalid_device_number:
-            write_message(stderr, msg_c_invalid_device_number, args);
-            break;
-        case c_send_func_ptr:
-            write_message(stderr, msg_c_send_func_ptr, args);
-            break;
-        case c_receive_func_ptr:
-            write_message(stderr, msg_c_receive_func_ptr, args);
-            break;
-        case c_offload_malloc:
-            write_message(stderr, msg_c_offload_malloc, args);
-            break;
-        case c_offload1:
-            write_message(stderr, msg_c_offload1, args);
-            break;
-        case c_unknown_var_type:
-            write_message(stderr, c_unknown_var_type, args);
-            break;
-        case c_invalid_env_var_value:
-            write_message(stderr, msg_c_invalid_env_var_value, args);
-            break;
-        case c_invalid_env_var_int_value:
-            write_message(stderr, msg_c_invalid_env_var_int_value, args);
-            break;
-        case c_invalid_env_report_value:
-            write_message(stderr, msg_c_invalid_env_report_value, args);
-            break;
-        case c_offload_signaled1:
-            write_message(stderr, msg_c_offload_signaled1, args);
-            break;
-        case c_offload_signaled2:
-            write_message(stderr, msg_c_offload_signaled2, args);
-            break;
-        case c_myowrapper_checkresult:
-            write_message(stderr, msg_c_myowrapper_checkresult, args);
-            break;
-        case c_myotarget_checkresult:
-            write_message(stderr, msg_c_myotarget_checkresult, args);
-            break;
-        case c_offload_descriptor_offload:
-            write_message(stderr, msg_c_offload_descriptor_offload, args);
-            break;
-        case c_merge_var_descs1:
-            write_message(stderr, msg_c_merge_var_descs1, args);
-            break;
-        case c_merge_var_descs2:
-            write_message(stderr, msg_c_merge_var_descs2, args);
-            break;
-        case c_mic_parse_env_var_list1:
-            write_message(stderr, msg_c_mic_parse_env_var_list1, args);
-            break;
-        case c_mic_parse_env_var_list2:
-            write_message(stderr, msg_c_mic_parse_env_var_list2, args);
-            break;
-        case c_mic_process_exit_ret:
-            write_message(stderr, msg_c_mic_process_exit_ret, args);
-            break;
-        case c_mic_process_exit_sig:
-            write_message(stderr, msg_c_mic_process_exit_sig, args);
-            break;
-        case c_mic_process_exit:
-            write_message(stderr, msg_c_mic_process_exit, args);
-            break;
-        case c_mic_init3:
-            write_message(stderr, msg_c_mic_init3, args);
-            break;
-        case c_mic_init4:
-            write_message(stderr, msg_c_mic_init4, args);
-            break;
-        case c_mic_init5:
-            write_message(stderr, msg_c_mic_init5, args);
-            break;
-        case c_mic_init6:
-            write_message(stderr, msg_c_mic_init6, args);
-            break;
-        case c_no_static_var_data:
-            write_message(stderr, msg_c_no_static_var_data, args);
-            break;
-        case c_no_ptr_data:
-            write_message(stderr, msg_c_no_ptr_data, args);
-            break;
-        case c_get_engine_handle:
-            write_message(stderr, msg_c_get_engine_handle, args);
-            break;
-        case c_get_engine_index:
-            write_message(stderr, msg_c_get_engine_index, args);
-            break;
-        case c_process_create:
-            write_message(stderr, msg_c_process_create, args);
-            break;
-        case c_process_wait_shutdown:
-            write_message(stderr, msg_c_process_wait_shutdown, args);
-            break;
-        case c_process_proxy_flush:
-            write_message(stderr, msg_c_process_proxy_flush, args);
-            break;
-        case c_process_get_func_handles:
-            write_message(stderr, msg_c_process_get_func_handles, args);
-            break;
-        case c_load_library:
-            write_message(stderr, msg_c_load_library, args);
-            break;
-        case c_coipipe_max_number:
-            write_message(stderr, msg_c_coi_pipeline_max_number, args);
-            break;
-        case c_pipeline_create:
-            write_message(stderr, msg_c_pipeline_create, args);
-            break;
-        case c_pipeline_run_func:
-            write_message(stderr, msg_c_pipeline_run_func, args);
-            break;
-        case c_pipeline_start_run_funcs:
-            write_message(stderr, msg_c_pipeline_start_run_funcs, args);
-            break;
-        case c_buf_create:
-            write_message(stderr, msg_c_buf_create, args);
-            break;
-        case c_buf_create_out_of_mem:
-            write_message(stderr, msg_c_buf_create_out_of_mem, args);
-            break;
-        case c_buf_create_from_mem:
-            write_message(stderr, msg_c_buf_create_from_mem, args);
-            break;
-        case c_buf_destroy:
-            write_message(stderr, msg_c_buf_destroy, args);
-            break;
-        case c_buf_map:
-            write_message(stderr, msg_c_buf_map, args);
-            break;
-        case c_buf_unmap:
-            write_message(stderr, msg_c_buf_unmap, args);
-            break;
-        case c_buf_read:
-            write_message(stderr, msg_c_buf_read, args);
-            break;
-        case c_buf_write:
-            write_message(stderr, msg_c_buf_write, args);
-            break;
-        case c_buf_copy:
-            write_message(stderr, msg_c_buf_copy, args);
-            break;
-        case c_buf_get_address:
-            write_message(stderr, msg_c_buf_get_address, args);
-            break;
-        case c_buf_add_ref:
-            write_message(stderr, msg_c_buf_add_ref, args);
-            break;
-        case c_buf_release_ref:
-            write_message(stderr, msg_c_buf_release_ref, args);
-            break;
-        case c_buf_set_state:
-            write_message(stderr, msg_c_buf_set_state, args);
-            break;
-        case c_event_wait:
-            write_message(stderr, msg_c_event_wait, args);
-            break;
-        case c_zero_or_neg_ptr_len:
-            write_message(stderr, msg_c_zero_or_neg_ptr_len, args);
-            break;
-        case c_zero_or_neg_transfer_size:
-            write_message(stderr, msg_c_zero_or_neg_transfer_size, args);
-            break;
-        case c_bad_ptr_mem_range:
-            write_message(stderr, msg_c_bad_ptr_mem_range, args);
-            break;
-        case c_different_src_and_dstn_sizes:
-            write_message(stderr, msg_c_different_src_and_dstn_sizes, args);
-            break;
-        case c_ranges_dont_match:
-            write_message(stderr, msg_c_ranges_dont_match, args);
-            break;
-        case c_destination_is_over:
-            write_message(stderr, msg_c_destination_is_over, args);
-            break;
-        case c_slice_of_noncont_array:
-            write_message(stderr, msg_c_slice_of_noncont_array, args);
-            break;
-        case c_non_contiguous_dope_vector:
-            write_message(stderr, msg_c_non_contiguous_dope_vector, args);
-            break;
-        case c_pointer_array_mismatch:
-            write_message(stderr, msg_c_pointer_array_mismatch, args);
-            break;
-        case c_omp_invalid_device_num_env:
-            write_message(stderr, msg_c_omp_invalid_device_num_env, args);
-            break;
-        case c_omp_invalid_device_num:
-            write_message(stderr, msg_c_omp_invalid_device_num, args);
-            break;
-        case c_unknown_binary_type:
-            write_message(stderr, msg_c_unknown_binary_type, args);
-            break;
-        case c_multiple_target_exes:
-            write_message(stderr, msg_c_multiple_target_exes, args);
-            break;
-        case c_no_target_exe:
-            write_message(stderr, msg_c_no_target_exe, args);
-            break;
-        case c_report_unknown_timer_node:
-            write_message(stderr, msg_c_report_unknown_timer_node, args);
-            break;
-        case c_report_unknown_trace_node:
-            write_message(stderr, msg_c_report_unknown_trace_node, args);
-            break;
-    }
-    va_end(args);
-}
-
-char const * report_get_message_str(error_types input_tag)
-{
-    switch (input_tag) {
-        case c_report_title:
-            return (offload_get_message_str(msg_c_report_title));
-        case c_report_from_file:
-            return (offload_get_message_str(msg_c_report_from_file));
-        case c_report_offload:
-            return (offload_get_message_str(msg_c_report_offload));
-        case c_report_mic:
-            return (offload_get_message_str(msg_c_report_mic));
-        case c_report_file:
-            return (offload_get_message_str(msg_c_report_file));
-        case c_report_line:
-            return (offload_get_message_str(msg_c_report_line));
-        case c_report_host:
-            return (offload_get_message_str(msg_c_report_host));
-        case c_report_tag:
-            return (offload_get_message_str(msg_c_report_tag));
-        case c_report_cpu_time:
-            return (offload_get_message_str(msg_c_report_cpu_time));
-        case c_report_seconds:
-            return (offload_get_message_str(msg_c_report_seconds));
-        case c_report_cpu_to_mic_data:
-            return (offload_get_message_str(msg_c_report_cpu_to_mic_data));
-        case c_report_bytes:
-            return (offload_get_message_str(msg_c_report_bytes));
-        case c_report_mic_time:
-            return (offload_get_message_str(msg_c_report_mic_time));
-        case c_report_mic_to_cpu_data:
-            return (offload_get_message_str(msg_c_report_mic_to_cpu_data));
-        case c_report_compute:
-            return (offload_get_message_str(msg_c_report_compute));
-        case c_report_copyin_data:
-            return (offload_get_message_str(msg_c_report_copyin_data));
-        case c_report_copyout_data:
-            return (offload_get_message_str(msg_c_report_copyout_data));
-        case c_report_create_buf_host:
-            return (offload_get_message_str(c_report_create_buf_host));
-        case c_report_create_buf_mic:
-            return (offload_get_message_str(msg_c_report_create_buf_mic));
-        case c_report_destroy:
-            return (offload_get_message_str(msg_c_report_destroy));
-        case c_report_gather_copyin_data:
-            return (offload_get_message_str(msg_c_report_gather_copyin_data));
-        case c_report_gather_copyout_data:
-            return (offload_get_message_str(msg_c_report_gather_copyout_data));
-        case c_report_state_signal:
-            return (offload_get_message_str(msg_c_report_state_signal));
-        case c_report_signal:
-            return (offload_get_message_str(msg_c_report_signal));
-        case c_report_wait:
-            return (offload_get_message_str(msg_c_report_wait));
-        case c_report_init:
-            return (offload_get_message_str(msg_c_report_init));
-        case c_report_init_func:
-            return (offload_get_message_str(msg_c_report_init_func));
-        case c_report_logical_card:
-            return (offload_get_message_str(msg_c_report_logical_card));
-        case c_report_mic_myo_fptr:
-            return (offload_get_message_str(msg_c_report_mic_myo_fptr));
-        case c_report_mic_myo_shared:
-            return (offload_get_message_str(msg_c_report_mic_myo_shared));
-        case c_report_myoacquire:
-            return (offload_get_message_str(msg_c_report_myoacquire));
-        case c_report_myofini:
-            return (offload_get_message_str(msg_c_report_myofini));
-        case c_report_myoinit:
-            return (offload_get_message_str(msg_c_report_myoinit));
-        case c_report_myoregister:
-            return (offload_get_message_str(msg_c_report_myoregister));
-        case c_report_myorelease:
-            return (offload_get_message_str(msg_c_report_myorelease));
-        case c_report_myosharedalignedfree:
-            return (
-                offload_get_message_str(msg_c_report_myosharedalignedfree));
-        case c_report_myosharedalignedmalloc:
-            return (
-                offload_get_message_str(msg_c_report_myosharedalignedmalloc));
-        case c_report_myosharedfree:
-            return (offload_get_message_str(msg_c_report_myosharedfree));
-        case c_report_myosharedmalloc:
-            return (offload_get_message_str(msg_c_report_myosharedmalloc));
-        case c_report_physical_card:
-            return (offload_get_message_str(msg_c_report_physical_card));
-        case c_report_receive_pointer_data:
-            return (
-                offload_get_message_str(msg_c_report_receive_pointer_data));
-        case c_report_received_pointer_data:
-            return (
-                offload_get_message_str(msg_c_report_received_pointer_data));
-        case c_report_register:
-            return (offload_get_message_str(msg_c_report_register));
-        case c_report_scatter_copyin_data:
-            return (offload_get_message_str(msg_c_report_scatter_copyin_data));
-        case c_report_scatter_copyout_data:
-            return (
-                offload_get_message_str(msg_c_report_scatter_copyout_data));
-        case c_report_send_pointer_data:
-            return (offload_get_message_str(msg_c_report_send_pointer_data));
-        case c_report_sent_pointer_data:
-            return (offload_get_message_str(msg_c_report_sent_pointer_data));
-        case c_report_start:
-            return (offload_get_message_str(msg_c_report_start));
-        case c_report_start_target_func:
-            return (offload_get_message_str(msg_c_report_start_target_func));
-        case c_report_state:
-            return (offload_get_message_str(msg_c_report_state));
-        case c_report_unregister:
-            return (offload_get_message_str(msg_c_report_unregister));
-        case c_report_var:
-            return (offload_get_message_str(msg_c_report_var));
-
-        default:
-            LIBOFFLOAD_ERROR(c_report_unknown_trace_node);
-            abort();
-    }
-}
-
-char const * report_get_host_stage_str(int i)
-{
-    switch (i) {
-        case c_offload_host_total_offload:
-            return (
-               offload_get_message_str(msg_c_report_host_total_offload_time));
-        case c_offload_host_initialize:
-            return (offload_get_message_str(msg_c_report_host_initialize));
-        case c_offload_host_target_acquire:
-            return (
-                offload_get_message_str(msg_c_report_host_target_acquire));
-        case c_offload_host_wait_deps:
-            return (offload_get_message_str(msg_c_report_host_wait_deps));
-        case c_offload_host_setup_buffers:
-            return (offload_get_message_str(msg_c_report_host_setup_buffers));
-        case c_offload_host_alloc_buffers:
-            return (offload_get_message_str(msg_c_report_host_alloc_buffers));
-        case c_offload_host_setup_misc_data:
-            return (
-                offload_get_message_str(msg_c_report_host_setup_misc_data));
-        case c_offload_host_alloc_data_buffer:
-            return (
-                offload_get_message_str(msg_c_report_host_alloc_data_buffer));
-        case c_offload_host_send_pointers:
-            return (offload_get_message_str(msg_c_report_host_send_pointers));
-        case c_offload_host_gather_inputs:
-            return (offload_get_message_str(msg_c_report_host_gather_inputs));
-        case c_offload_host_map_in_data_buffer:
-            return (
-                offload_get_message_str(msg_c_report_host_map_in_data_buffer));
-        case c_offload_host_unmap_in_data_buffer:
-            return (offload_get_message_str(
-                msg_c_report_host_unmap_in_data_buffer));
-        case c_offload_host_start_compute:
-            return (offload_get_message_str(msg_c_report_host_start_compute));
-        case c_offload_host_wait_compute:
-            return (offload_get_message_str(msg_c_report_host_wait_compute));
-        case c_offload_host_start_buffers_reads:
-            return (offload_get_message_str(
-                msg_c_report_host_start_buffers_reads));
-        case c_offload_host_scatter_outputs:
-            return (
-                offload_get_message_str(msg_c_report_host_scatter_outputs));
-        case c_offload_host_map_out_data_buffer:
-            return (offload_get_message_str(
-                msg_c_report_host_map_out_data_buffer));
-        case c_offload_host_unmap_out_data_buffer:
-            return (offload_get_message_str(
-                msg_c_report_host_unmap_out_data_buffer));
-        case c_offload_host_wait_buffers_reads:
-            return (
-                offload_get_message_str(msg_c_report_host_wait_buffers_reads));
-        case c_offload_host_destroy_buffers:
-            return (
-                offload_get_message_str(msg_c_report_host_destroy_buffers));
-        default:
-            LIBOFFLOAD_ERROR(c_report_unknown_timer_node);
-            abort();
-    }
-}
-
-char const * report_get_target_stage_str(int i)
-{
-    switch (i) {
-        case c_offload_target_total_time:
-            return (offload_get_message_str(msg_c_report_target_total_time));
-        case c_offload_target_descriptor_setup:
-            return (
-                offload_get_message_str(msg_c_report_target_descriptor_setup));
-        case c_offload_target_func_lookup:
-            return (offload_get_message_str(msg_c_report_target_func_lookup));
-        case c_offload_target_func_time:
-            return (offload_get_message_str(msg_c_report_target_func_time));
-        case c_offload_target_scatter_inputs:
-            return (
-                offload_get_message_str(msg_c_report_target_scatter_inputs));
-        case c_offload_target_add_buffer_refs:
-            return (
-                offload_get_message_str(msg_c_report_target_add_buffer_refs));
-        case c_offload_target_compute:
-            return (offload_get_message_str(msg_c_report_target_compute));
-        case c_offload_target_gather_outputs:
-            return (offload_get_message_str
-                (msg_c_report_target_gather_outputs));
-        case c_offload_target_release_buffer_refs:
-            return (offload_get_message_str(
-                msg_c_report_target_release_buffer_refs));
-        default:
-            LIBOFFLOAD_ERROR(c_report_unknown_timer_node);
-            abort();
-    }
-}
diff --git a/offload/src/liboffload_error_codes.h b/offload/src/liboffload_error_codes.h
deleted file mode 100644
index 982167b..0000000
--- a/offload/src/liboffload_error_codes.h
+++ /dev/null
@@ -1,276 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#if !defined(LIBOFFLOAD_ERROR_CODES_H)
-#define LIBOFFLOAD_ERROR_CODES_H
-#include <stdarg.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-typedef enum
-{
-    c_device_is_not_available = 0,
-    c_invalid_device_number,
-    c_offload1,
-    c_unknown_var_type,
-    c_send_func_ptr,
-    c_receive_func_ptr,
-    c_offload_malloc,
-    c_invalid_env_var_value,
-    c_invalid_env_var_int_value,
-    c_invalid_env_report_value,
-    c_offload_signaled1,
-    c_offload_signaled2,
-    c_myotarget_checkresult,
-    c_myowrapper_checkresult,
-    c_offload_descriptor_offload,
-    c_merge_var_descs1,
-    c_merge_var_descs2,
-    c_mic_parse_env_var_list1,
-    c_mic_parse_env_var_list2,
-    c_mic_process_exit_ret,
-    c_mic_process_exit_sig,
-    c_mic_process_exit,
-    c_mic_init3,
-    c_mic_init4,
-    c_mic_init5,
-    c_mic_init6,
-    c_no_static_var_data,
-    c_no_ptr_data,
-    c_get_engine_handle,
-    c_get_engine_index,
-    c_process_create,
-    c_process_get_func_handles,
-    c_process_wait_shutdown,
-    c_process_proxy_flush,
-    c_load_library,
-    c_pipeline_create,
-    c_pipeline_run_func,
-    c_pipeline_start_run_funcs,
-    c_buf_create,
-    c_buf_create_out_of_mem,
-    c_buf_create_from_mem,
-    c_buf_destroy,
-    c_buf_map,
-    c_buf_unmap,
-    c_buf_read,
-    c_buf_write,
-    c_buf_copy,
-    c_buf_get_address,
-    c_buf_add_ref,
-    c_buf_release_ref,
-    c_buf_set_state,
-    c_event_wait,
-    c_zero_or_neg_ptr_len,
-    c_zero_or_neg_transfer_size,
-    c_bad_ptr_mem_range,
-    c_different_src_and_dstn_sizes,
-    c_ranges_dont_match,
-    c_destination_is_over,
-    c_slice_of_noncont_array,
-    c_non_contiguous_dope_vector,
-    c_pointer_array_mismatch,
-    c_omp_invalid_device_num_env,
-    c_omp_invalid_device_num,
-    c_unknown_binary_type,
-    c_multiple_target_exes,
-    c_no_target_exe,
-    c_report_host,
-    c_report_target,
-    c_report_title,
-    c_report_from_file,
-    c_report_file,
-    c_report_line,
-    c_report_tag,
-    c_report_seconds,
-    c_report_bytes,
-    c_report_mic,
-    c_report_cpu_time,
-    c_report_cpu_to_mic_data,
-    c_report_mic_time,
-    c_report_mic_to_cpu_data,
-    c_report_unknown_timer_node,
-    c_report_unknown_trace_node,
-    c_report_offload,
-    c_report_w_tag,
-    c_report_state,
-    c_report_start,
-    c_report_init,
-    c_report_logical_card,
-    c_report_physical_card,
-    c_report_register,
-    c_report_init_func,
-    c_report_create_buf_host,
-    c_report_create_buf_mic,
-    c_report_send_pointer_data,
-    c_report_sent_pointer_data,
-    c_report_gather_copyin_data,
-    c_report_copyin_data,
-    c_report_state_signal,
-    c_report_signal,
-    c_report_wait,
-    c_report_compute,
-    c_report_receive_pointer_data,
-    c_report_received_pointer_data,
-    c_report_start_target_func,
-    c_report_var,
-    c_report_scatter_copyin_data,
-    c_report_gather_copyout_data,
-    c_report_scatter_copyout_data,
-    c_report_copyout_data,
-    c_report_unregister,
-    c_report_destroy,
-    c_report_myoinit,
-    c_report_myoregister,
-    c_report_myofini,
-    c_report_mic_myo_shared,
-    c_report_mic_myo_fptr,
-    c_report_myosharedmalloc,
-    c_report_myosharedfree,
-    c_report_myosharedalignedmalloc,
-    c_report_myosharedalignedfree,
-    c_report_myoacquire,
-    c_report_myorelease,
-    c_coipipe_max_number
-} error_types;
-
-enum OffloadHostPhase {
-    // Total time on host for entire offload
-    c_offload_host_total_offload = 0,
-
-    // Time to load target binary
-    c_offload_host_initialize,
-
-    // Time to acquire lrb availability dynamically
-    c_offload_host_target_acquire,
-
-    // Time to wait for dependencies
-    c_offload_host_wait_deps,
-
-    // Time to allocate pointer buffers, initiate writes for pointers
-    // and calculate size of copyin/copyout buffer
-    c_offload_host_setup_buffers,
-
-    // Time to allocate pointer buffers
-    c_offload_host_alloc_buffers,
-
-    // Time to initialize misc data
-    c_offload_host_setup_misc_data,
-
-    // Time to allocate copyin/copyout buffer
-    c_offload_host_alloc_data_buffer,
-
-    // Time to initiate writes from host pointers to buffers
-    c_offload_host_send_pointers,
-
-    // Time to Gather IN data of offload into buffer
-    c_offload_host_gather_inputs,
-
-    // Time to map buffer
-    c_offload_host_map_in_data_buffer,
-
-    // Time to unmap buffer
-    c_offload_host_unmap_in_data_buffer,
-
-    // Time to start remote function call that does computation on lrb
-    c_offload_host_start_compute,
-
-    // Time to wait for compute to finish
-    c_offload_host_wait_compute,
-
-    // Time to initiate reads from pointer buffers
-    c_offload_host_start_buffers_reads,
-
-    // Time to update host variabels with OUT data from buffer
-    c_offload_host_scatter_outputs,
-
-    // Time to map buffer
-    c_offload_host_map_out_data_buffer,
-
-    // Time to unmap buffer
-    c_offload_host_unmap_out_data_buffer,
-
-    // Time to wait reads from buffers to finish
-    c_offload_host_wait_buffers_reads,
-
-    // Time to destroy buffers that are no longer needed
-    c_offload_host_destroy_buffers,
-
-    // LAST TIME MONITOR
-    c_offload_host_max_phase
-};
-
-enum OffloadTargetPhase {
-    // Total time spent on the target
-    c_offload_target_total_time = 0,
-
-    // Time to initialize offload descriptor
-    c_offload_target_descriptor_setup,
-
-    // Time to find target entry point in lookup table
-    c_offload_target_func_lookup,
-
-    // Total time spend executing offload entry
-    c_offload_target_func_time,
-
-    // Time to initialize target variables with IN values from buffer
-    c_offload_target_scatter_inputs,
-
-    // Time to add buffer reference for pointer buffers
-    c_offload_target_add_buffer_refs,
-
-    // Total time on lrb for computation
-    c_offload_target_compute,
-
-    // On lrb, time to copy OUT into buffer
-    c_offload_target_gather_outputs,
-
-    // Time to release buffer references
-    c_offload_target_release_buffer_refs,
-
-    // LAST TIME MONITOR
-    c_offload_target_max_phase
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-void __liboffload_error_support(error_types input_tag, ...);
-void __liboffload_report_support(error_types input_tag, ...);
-char const *offload_get_message_str(int msgCode);
-char const * report_get_message_str(error_types input_tag);
-char const * report_get_host_stage_str(int i);
-char const * report_get_target_stage_str(int i);
-#ifdef __cplusplus
-}
-#endif
-
-#define test_msg_cat(nm, msg) \
-    fprintf(stderr, "\t TEST for %s \n \t", nm); \
-    __liboffload_error_support(msg);
-
-#define test_msg_cat1(nm, msg, ...) \
-    fprintf(stderr, "\t TEST for %s \n \t", nm); \
-    __liboffload_error_support(msg, __VA_ARGS__);
-
-void write_message(FILE * file, int msgCode, va_list args_p);
-
-#define LIBOFFLOAD_ERROR __liboffload_error_support
-
-#ifdef TARGET_WINNT
-#define LIBOFFLOAD_ABORT \
-         _set_abort_behavior(0, _WRITE_ABORT_MSG); \
-         abort()
-#else
-#define LIBOFFLOAD_ABORT \
-         abort()
-#endif
-
-#endif // !defined(LIBOFFLOAD_ERROR_CODES_H)
diff --git a/offload/src/liboffload_msg.c b/offload/src/liboffload_msg.c
deleted file mode 100644
index b160392..0000000
--- a/offload/src/liboffload_msg.c
+++ /dev/null
@@ -1,35 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-
-#include <stdio.h>
-
-// ===========================================================================
-// Bring in the static string table and the enumerations for indexing into
-// it.
-// ===========================================================================
-
-#include "liboffload_msg.h"
-
-# define DYNART_STDERR_PUTS(__message_text__) fputs((__message_text__),stderr)
-
-// ===========================================================================
-// Now the code for accessing the message catalogs
-// ===========================================================================
-
-
-    void write_message(FILE * file, int msgCode) {
-        fputs(MESSAGE_TABLE_NAME[ msgCode ], file);
-        fflush(file);
-    }
-
-    char const *offload_get_message_str(int msgCode) {
-        return MESSAGE_TABLE_NAME[ msgCode ];
-    }
diff --git a/offload/src/liboffload_msg.h b/offload/src/liboffload_msg.h
deleted file mode 100644
index c1445f9..0000000
--- a/offload/src/liboffload_msg.h
+++ /dev/null
@@ -1,326 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-// file: liboffload_msg.h
-enum {
-	__dummy__ = 0,
-	msg_c_device_is_not_available,
-	msg_c_invalid_device_number,
-	msg_c_send_func_ptr,
-	msg_c_receive_func_ptr,
-	msg_c_offload_malloc,
-	msg_c_offload1,
-	msg_c_unknown_var_type,
-	msg_c_invalid_env_var_value,
-	msg_c_invalid_env_var_int_value,
-	msg_c_invalid_env_report_value,
-	msg_c_offload_signaled1,
-	msg_c_offload_signaled2,
-	msg_c_myowrapper_checkresult,
-	msg_c_myotarget_checkresult,
-	msg_c_offload_descriptor_offload,
-	msg_c_merge_var_descs1,
-	msg_c_merge_var_descs2,
-	msg_c_mic_parse_env_var_list1,
-	msg_c_mic_parse_env_var_list2,
-	msg_c_mic_process_exit_ret,
-	msg_c_mic_process_exit_sig,
-	msg_c_mic_process_exit,
-	msg_c_mic_init3,
-	msg_c_mic_init4,
-	msg_c_mic_init5,
-	msg_c_mic_init6,
-	msg_c_no_static_var_data,
-	msg_c_no_ptr_data,
-	msg_c_get_engine_handle,
-	msg_c_get_engine_index,
-	msg_c_process_create,
-	msg_c_process_get_func_handles,
-	msg_c_process_wait_shutdown,
-	msg_c_process_proxy_flush,
-	msg_c_load_library,
-	msg_c_pipeline_create,
-	msg_c_pipeline_run_func,
-	msg_c_pipeline_start_run_funcs,
-	msg_c_buf_create,
-	msg_c_buf_create_out_of_mem,
-	msg_c_buf_create_from_mem,
-	msg_c_buf_destroy,
-	msg_c_buf_map,
-	msg_c_buf_unmap,
-	msg_c_buf_read,
-	msg_c_buf_write,
-	msg_c_buf_copy,
-	msg_c_buf_get_address,
-	msg_c_buf_add_ref,
-	msg_c_buf_release_ref,
-	msg_c_buf_set_state,
-	msg_c_event_wait,
-	msg_c_zero_or_neg_ptr_len,
-	msg_c_zero_or_neg_transfer_size,
-	msg_c_bad_ptr_mem_range,
-	msg_c_different_src_and_dstn_sizes,
-	msg_c_non_contiguous_dope_vector,
-	msg_c_omp_invalid_device_num_env,
-	msg_c_omp_invalid_device_num,
-	msg_c_unknown_binary_type,
-	msg_c_multiple_target_exes,
-	msg_c_no_target_exe,
-	msg_c_report_unknown_timer_node,
-	msg_c_report_unknown_trace_node,
-	msg_c_report_host,
-	msg_c_report_mic,
-	msg_c_report_title,
-	msg_c_report_seconds,
-	msg_c_report_bytes,
-	msg_c_report_cpu_time,
-	msg_c_report_mic_time,
-	msg_c_report_tag,
-	msg_c_report_from_file,
-	msg_c_report_file,
-	msg_c_report_line,
-	msg_c_report_cpu_to_mic_data,
-	msg_c_report_mic_to_cpu_data,
-	msg_c_report_offload,
-	msg_c_report_w_tag,
-	msg_c_report_state,
-	msg_c_report_start,
-	msg_c_report_init,
-	msg_c_report_logical_card,
-	msg_c_report_physical_card,
-	msg_c_report_register,
-	msg_c_report_init_func,
-	msg_c_report_create_buf_host,
-	msg_c_report_create_buf_mic,
-	msg_c_report_send_pointer_data,
-	msg_c_report_sent_pointer_data,
-	msg_c_report_gather_copyin_data,
-	msg_c_report_copyin_data,
-	msg_c_report_state_signal,
-	msg_c_report_signal,
-	msg_c_report_wait,
-	msg_c_report_compute,
-	msg_c_report_receive_pointer_data,
-	msg_c_report_received_pointer_data,
-	msg_c_report_start_target_func,
-	msg_c_report_var,
-	msg_c_report_scatter_copyin_data,
-	msg_c_report_gather_copyout_data,
-	msg_c_report_scatter_copyout_data,
-	msg_c_report_copyout_data,
-	msg_c_report_unregister,
-	msg_c_report_destroy,
-	msg_c_report_myoinit,
-	msg_c_report_myoregister,
-	msg_c_report_myofini,
-	msg_c_report_mic_myo_shared,
-	msg_c_report_mic_myo_fptr,
-	msg_c_report_myosharedmalloc,
-	msg_c_report_myosharedfree,
-	msg_c_report_myosharedalignedmalloc,
-	msg_c_report_myosharedalignedfree,
-	msg_c_report_myoacquire,
-	msg_c_report_myorelease,
-	msg_c_report_host_total_offload_time,
-	msg_c_report_host_initialize,
-	msg_c_report_host_target_acquire,
-	msg_c_report_host_wait_deps,
-	msg_c_report_host_setup_buffers,
-	msg_c_report_host_alloc_buffers,
-	msg_c_report_host_setup_misc_data,
-	msg_c_report_host_alloc_data_buffer,
-	msg_c_report_host_send_pointers,
-	msg_c_report_host_gather_inputs,
-	msg_c_report_host_map_in_data_buffer,
-	msg_c_report_host_unmap_in_data_buffer,
-	msg_c_report_host_start_compute,
-	msg_c_report_host_wait_compute,
-	msg_c_report_host_start_buffers_reads,
-	msg_c_report_host_scatter_outputs,
-	msg_c_report_host_map_out_data_buffer,
-	msg_c_report_host_unmap_out_data_buffer,
-	msg_c_report_host_wait_buffers_reads,
-	msg_c_report_host_destroy_buffers,
-	msg_c_report_target_total_time,
-	msg_c_report_target_descriptor_setup,
-	msg_c_report_target_func_lookup,
-	msg_c_report_target_func_time,
-	msg_c_report_target_scatter_inputs,
-	msg_c_report_target_add_buffer_refs,
-	msg_c_report_target_compute,
-	msg_c_report_target_gather_outputs,
-	msg_c_report_target_release_buffer_refs,
-	msg_c_coi_pipeline_max_number,
-	msg_c_ranges_dont_match,
-	msg_c_destination_is_over,
-	msg_c_slice_of_noncont_array,
-	msg_c_pointer_array_mismatch,
-	lastMsg = 152,
-	firstMsg = 1
-};
-
-
-#if !defined(MESSAGE_TABLE_NAME)
-#    define MESSAGE_TABLE_NAME __liboffload_message_table
-#endif
-
-static char const * MESSAGE_TABLE_NAME[] = {
-	/*   0 __dummy__                               */ "Un-used message",
-	/*   1 msg_c_device_is_not_available           */ "offload error: cannot offload to MIC - device is not available",
-	/*   2 msg_c_invalid_device_number             */ "offload error: expected a number greater than or equal to -1",
-	/*   3 msg_c_send_func_ptr                     */ "offload error: cannot find function name for address %p",
-	/*   4 msg_c_receive_func_ptr                  */ "offload error: cannot find address of function %s",
-	/*   5 msg_c_offload_malloc                    */ "offload error: memory allocation failed (requested=%lld bytes, align %lld)",
-	/*   6 msg_c_offload1                          */ "offload error: device %d does not have a pending signal for wait(%p)",
-	/*   7 msg_c_unknown_var_type                  */ "offload error: unknown variable type %d",
-	/*   8 msg_c_invalid_env_var_value             */ "offload warning: ignoring invalid value specified for %s",
-	/*   9 msg_c_invalid_env_var_int_value         */ "offload warning: specify an integer value for %s",
-	/*  10 msg_c_invalid_env_report_value          */ "offload warning: ignoring %s setting; use a value in range 1-3",
-	/*  11 msg_c_offload_signaled1                 */ "offload error: invalid device number %d specified in _Offload_signaled",
-	/*  12 msg_c_offload_signaled2                 */ "offload error: invalid signal %p specified for _Offload_signaled",
-	/*  13 msg_c_myowrapper_checkresult            */ "offload error: %s failed with error %d",
-	/*  14 msg_c_myotarget_checkresult             */ "offload error: %s failed with error %d",
-	/*  15 msg_c_offload_descriptor_offload        */ "offload error: cannot find offload entry %s",
-	/*  16 msg_c_merge_var_descs1                  */ "offload error: unexpected number of variable descriptors",
-	/*  17 msg_c_merge_var_descs2                  */ "offload error: unexpected variable type",
-	/*  18 msg_c_mic_parse_env_var_list1           */ "offload_error: MIC environment variable must begin with an alpabetic character",
-	/*  19 msg_c_mic_parse_env_var_list2           */ "offload_error: MIC environment variable value must be specified with ‘=’",
-	/*  20 msg_c_mic_process_exit_ret              */ "offload error: process on the device %d unexpectedly exited with code %d",
-	/*  21 msg_c_mic_process_exit_sig              */ "offload error: process on the device %d was terminated by signal %d (%s)",
-	/*  22 msg_c_mic_process_exit                  */ "offload error: process on the device %d was unexpectedly terminated",
-	/*  23 msg_c_mic_init3                         */ "offload warning: ignoring MIC_STACKSIZE setting; use a value >= 16K and a multiple of 4K",
-	/*  24 msg_c_mic_init4                         */ "offload error: thread key create failed with error %d",
-	/*  25 msg_c_mic_init5                         */ "offload warning: specify OFFLOAD_DEVICES as comma-separated physical device numbers or 'none'",
-	/*  26 msg_c_mic_init6                         */ "offload warning: OFFLOAD_DEVICES device number %d does not correspond to a physical device",
-	/*  27 msg_c_no_static_var_data                */ "offload error: cannot find data associated with statically allocated variable %p",
-	/*  28 msg_c_no_ptr_data                       */ "offload error: cannot find data associated with pointer variable %p",
-	/*  29 msg_c_get_engine_handle                 */ "offload error: cannot get device %d handle (error code %d)",
-	/*  30 msg_c_get_engine_index                  */ "offload error: cannot get physical index for logical device %d (error code %d)",
-	/*  31 msg_c_process_create                    */ "offload error: cannot start process on the device %d (error code %d)",
-	/*  32 msg_c_process_get_func_handles          */ "offload error: cannot get function handles on the device %d (error code %d)",
-	/*  33 msg_c_process_wait_shutdown             */ "offload error: wait for process shutdown failed on device %d (error code %d)",
-	/*  34 msg_c_process_proxy_flush               */ "offload error: cannot flush process output on device %d (error code %d)",
-	/*  35 msg_c_load_library                      */ "offload error: cannot load library to the device %d (error code %d)",
-	/*  36 msg_c_pipeline_create                   */ "offload error: cannot create pipeline on the device %d (error code %d)",
-	/*  37 msg_c_pipeline_run_func                 */ "offload error: cannot execute function on the device %d (error code %d)",
-	/*  38 msg_c_pipeline_start_run_funcs          */ "offload error: cannot start executing pipeline function on the device %d (error code %d)",
-	/*  39 msg_c_buf_create                        */ "offload error: cannot create buffer on device %d (error code %d)",
-	/*  40 msg_c_buf_create_out_of_mem             */ "offload error: cannot create buffer on device %d, out of memory",
-	/*  41 msg_c_buf_create_from_mem               */ "offload error: cannot create buffer from memory on device %d (error code %d)",
-	/*  42 msg_c_buf_destroy                       */ "offload error: buffer destroy failed (error code %d)",
-	/*  43 msg_c_buf_map                           */ "offload error: buffer map failed (error code %d)",
-	/*  44 msg_c_buf_unmap                         */ "offload error: buffer unmap failed (error code %d)",
-	/*  45 msg_c_buf_read                          */ "offload error: buffer read failed (error code %d)",
-	/*  46 msg_c_buf_write                         */ "offload error: buffer write failed (error code %d)",
-	/*  47 msg_c_buf_copy                          */ "offload error: buffer copy failed (error code %d)",
-	/*  48 msg_c_buf_get_address                   */ "offload error: cannot get buffer address on device %d (error code %d)",
-	/*  49 msg_c_buf_add_ref                       */ "offload error: cannot reuse buffer memory on device %d (error code %d)",
-	/*  50 msg_c_buf_release_ref                   */ "offload error: cannot release buffer memory on device %d (error code %d)",
-	/*  51 msg_c_buf_set_state                     */ "offload error: buffer set state failed (error code %d)",
-	/*  52 msg_c_event_wait                        */ "offload error: wait for event to become signaled failed (error code %d)",
-	/*  53 msg_c_zero_or_neg_ptr_len               */ "offload error: memory allocation of negative length is not supported",
-	/*  54 msg_c_zero_or_neg_transfer_size         */ "offload error: data transfer of zero or negative size is not supported",
-	/*  55 msg_c_bad_ptr_mem_range                 */ "offload error: address range partially overlaps with existing allocation",
-	/*  56 msg_c_different_src_and_dstn_sizes      */ "offload error: size of the source %d differs from size of the destination %d",
-	/*  57 msg_c_non_contiguous_dope_vector        */ "offload error: offload data transfer supports only a single contiguous memory range per variable",
-	/*  58 msg_c_omp_invalid_device_num_env        */ "offload warning: ignoring %s setting; use a non-negative integer value",
-	/*  59 msg_c_omp_invalid_device_num            */ "offload error: device number should be a non-negative integer value",
-	/*  60 msg_c_unknown_binary_type               */ "offload error: unexpected embedded target binary type, expected either an executable or shared library",
-	/*  61 msg_c_multiple_target_exes              */ "offload error: more that one target executable found",
-	/*  62 msg_c_no_target_exe                     */ "offload error: target executable is not available",
-	/*  63 msg_c_report_unknown_timer_node         */ "offload error: unknown timer node",
-	/*  64 msg_c_report_unknown_trace_node         */ "offload error: unknown trace node",
-	/*  65 msg_c_report_host                       */ "HOST",
-	/*  66 msg_c_report_mic                        */ "MIC",
-	/*  67 msg_c_report_title                      */ "timer data       (sec)",
-	/*  68 msg_c_report_seconds                    */ "(seconds)",
-	/*  69 msg_c_report_bytes                      */ "(bytes)",
-	/*  70 msg_c_report_cpu_time                   */ "CPU Time",
-	/*  71 msg_c_report_mic_time                   */ "MIC Time",
-	/*  72 msg_c_report_tag                        */ "Tag",
-	/*  73 msg_c_report_from_file                  */ "Offload from file",
-	/*  74 msg_c_report_file                       */ "File",
-	/*  75 msg_c_report_line                       */ "Line",
-	/*  76 msg_c_report_cpu_to_mic_data            */ "CPU->MIC Data",
-	/*  77 msg_c_report_mic_to_cpu_data            */ "MIC->CPU Data",
-	/*  78 msg_c_report_offload                    */ "Offload",
-	/*  79 msg_c_report_w_tag                      */ "Tag %d",
-	/*  80 msg_c_report_state                      */ "State",
-	/*  81 msg_c_report_start                      */ "Start target",
-	/*  82 msg_c_report_init                       */ "Initialize",
-	/*  83 msg_c_report_logical_card               */ "logical card",
-	/*  84 msg_c_report_physical_card              */ "physical card",
-	/*  85 msg_c_report_register                   */ "Register static data tables",
-	/*  86 msg_c_report_init_func                  */ "Setup target entry",
-	/*  87 msg_c_report_create_buf_host            */ "Create host buffer",
-	/*  88 msg_c_report_create_buf_mic             */ "Create target buffer",
-	/*  89 msg_c_report_send_pointer_data          */ "Send pointer data",
-	/*  90 msg_c_report_sent_pointer_data          */ "Host->target pointer data",
-	/*  91 msg_c_report_gather_copyin_data         */ "Gather copyin data",
-	/*  92 msg_c_report_copyin_data                */ "Host->target copyin data",
-	/*  93 msg_c_report_state_signal               */ "Signal",
-	/*  94 msg_c_report_signal                     */ "signal :",
-	/*  95 msg_c_report_wait                       */ "waits  :",
-	/*  96 msg_c_report_compute                    */ "Execute task on target",
-	/*  97 msg_c_report_receive_pointer_data       */ "Receive pointer data",
-	/*  98 msg_c_report_received_pointer_data      */ "Target->host pointer data",
-	/*  99 msg_c_report_start_target_func          */ "Start target entry",
-	/* 100 msg_c_report_var                        */ "Var",
-	/* 101 msg_c_report_scatter_copyin_data        */ "Scatter copyin data",
-	/* 102 msg_c_report_gather_copyout_data        */ "Gather copyout data",
-	/* 103 msg_c_report_scatter_copyout_data       */ "Scatter copyout data",
-	/* 104 msg_c_report_copyout_data               */ "Target->host copyout data",
-	/* 105 msg_c_report_unregister                 */ "Unregister data tables",
-	/* 106 msg_c_report_destroy                    */ "Destroy",
-	/* 107 msg_c_report_myoinit                    */ "Initialize MYO",
-	/* 108 msg_c_report_myoregister                */ "Register MYO tables",
-	/* 109 msg_c_report_myofini                    */ "Finalize MYO",
-	/* 110 msg_c_report_mic_myo_shared             */ "MIC MYO shared table register",
-	/* 111 msg_c_report_mic_myo_fptr               */ "MIC MYO fptr table register",
-	/* 112 msg_c_report_myosharedmalloc            */ "MYO shared malloc",
-	/* 113 msg_c_report_myosharedfree              */ "MYO shared free",
-	/* 114 msg_c_report_myosharedalignedmalloc     */ "MYO shared aligned malloc",
-	/* 115 msg_c_report_myosharedalignedfree       */ "MYO shared aligned free",
-	/* 116 msg_c_report_myoacquire                 */ "MYO acquire",
-	/* 117 msg_c_report_myorelease                 */ "MYO release",
-	/* 118 msg_c_report_host_total_offload_time    */ "host: total offload time",
-	/* 119 msg_c_report_host_initialize            */ "host: initialize target",
-	/* 120 msg_c_report_host_target_acquire        */ "host: acquire target",
-	/* 121 msg_c_report_host_wait_deps             */ "host: wait dependencies",
-	/* 122 msg_c_report_host_setup_buffers         */ "host: setup buffers",
-	/* 123 msg_c_report_host_alloc_buffers         */ "host: allocate buffers",
-	/* 124 msg_c_report_host_setup_misc_data       */ "host: setup misc_data",
-	/* 125 msg_c_report_host_alloc_data_buffer     */ "host: allocate buffer",
-	/* 126 msg_c_report_host_send_pointers         */ "host: send pointers",
-	/* 127 msg_c_report_host_gather_inputs         */ "host: gather inputs",
-	/* 128 msg_c_report_host_map_in_data_buffer    */ "host: map IN data buffer",
-	/* 129 msg_c_report_host_unmap_in_data_buffer  */ "host: unmap IN data buffer",
-	/* 130 msg_c_report_host_start_compute         */ "host: initiate compute",
-	/* 131 msg_c_report_host_wait_compute          */ "host: wait compute",
-	/* 132 msg_c_report_host_start_buffers_reads   */ "host: initiate pointer reads",
-	/* 133 msg_c_report_host_scatter_outputs       */ "host: scatter outputs",
-	/* 134 msg_c_report_host_map_out_data_buffer   */ "host: map OUT data buffer",
-	/* 135 msg_c_report_host_unmap_out_data_buffer */ "host: unmap OUT data buffer",
-	/* 136 msg_c_report_host_wait_buffers_reads    */ "host: wait pointer reads",
-	/* 137 msg_c_report_host_destroy_buffers       */ "host: destroy buffers",
-	/* 138 msg_c_report_target_total_time          */ "target: total time",
-	/* 139 msg_c_report_target_descriptor_setup    */ "target: setup offload descriptor",
-	/* 140 msg_c_report_target_func_lookup         */ "target: entry lookup",
-	/* 141 msg_c_report_target_func_time           */ "target: entry time",
-	/* 142 msg_c_report_target_scatter_inputs      */ "target: scatter inputs",
-	/* 143 msg_c_report_target_add_buffer_refs     */ "target: add buffer reference",
-	/* 144 msg_c_report_target_compute             */ "target: compute",
-	/* 145 msg_c_report_target_gather_outputs      */ "target: gather outputs",
-	/* 146 msg_c_report_target_release_buffer_refs */ "target: remove buffer reference",
-	/* 147 msg_c_coi_pipeline_max_number           */ "number of host threads doing offload exceeds maximum of %d",
-	/* 148 msg_c_ranges_dont_match                 */ "ranges of source and destination don't match together",
-	/* 149 msg_c_destination_is_over               */ "insufficient destination memory to transfer source",
-	/* 150 msg_c_slice_of_noncont_array            */ "a non-contiguous slice may be taken of contiguous arrays only",
-	/* 151 msg_c_pointer_array_mismatch            */ "number of %s elements is less than described by the source",
-};
diff --git a/offload/src/mic_lib.f90 b/offload/src/mic_lib.f90
deleted file mode 100644
index 0c2e4de..0000000
--- a/offload/src/mic_lib.f90
+++ /dev/null
@@ -1,441 +0,0 @@
-!
-!//===----------------------------------------------------------------------===//
-!//
-!//                     The LLVM Compiler Infrastructure
-!//
-!// This file is dual licensed under the MIT and the University of Illinois Open
-!// Source Licenses. See LICENSE.txt for details.
-!//
-!//===----------------------------------------------------------------------===//
-!
-
-
-! **********************************************************************************
-! * This file is intended to support the Intel(r) Many Integrated Core Architecture.
-! **********************************************************************************
-! free form Fortran source - should be named .f90
-! lines are longer than 72 characters
-
-module mic_lib
-use, intrinsic :: iso_c_binding
-
-integer, parameter:: target_mic=2
-integer, parameter:: default_target_type=target_mic
-integer, parameter:: default_target_number=0
-
-enum, bind(C)
-    enumerator :: OFFLOAD_SUCCESS  = 0
-    enumerator :: OFFLOAD_DISABLED          ! offload is disabled
-    enumerator :: OFFLOAD_UNAVAILABLE       ! card is not available
-    enumerator :: OFFLOAD_OUT_OF_MEMORY     ! not enough memory on device
-    enumerator :: OFFLOAD_PROCESS_DIED      ! target process has died
-    enumerator :: OFFLOAD_ERROR             ! unspecified error
-end enum
-
-type, bind (C) :: offload_status
-    integer(kind=c_int)    ::  result          = OFFLOAD_DISABLED
-    integer(kind=c_int)    ::  device_number   = -1
-    integer(kind=c_size_t) ::  data_sent       = 0
-    integer(kind=c_size_t) ::  data_received   = 0
-end type offload_status
-
-interface
-function offload_number_of_devices ()                                  &
-           bind (C, name = "_Offload_number_of_devices")
-!dec$ attributes default :: offload_number_of_devices
-  import :: c_int
-  integer (kind=c_int)        :: offload_number_of_devices
-!dec$ attributes offload:mic :: offload_number_of_devices
-!dir$ attributes known_intrinsic ::  offload_number_of_devices
-end function offload_number_of_devices
-
-function offload_signaled(target_number, signal)                       &
-           bind (C, name = "_Offload_signaled")
-!dec$ attributes default :: offload_signaled
-  import :: c_int, c_int64_t
-  integer (kind=c_int) :: offload_signaled
-  integer (kind=c_int), value :: target_number
-  integer (kind=c_int64_t), value :: signal
-!dec$ attributes offload:mic :: offload_signaled
-end function offload_signaled
-
-subroutine offload_report(val)                                         &
-           bind (C, name = "_Offload_report")
-!dec$ attributes default :: offload_report
-  import :: c_int
-  integer (kind=c_int), value :: val
-!dec$ attributes offload:mic :: offload_report
-end subroutine offload_report
-
-function offload_get_device_number()                                   &
-           bind (C, name = "_Offload_get_device_number")
-!dec$ attributes default :: offload_get_device_number
-  import :: c_int
-  integer (kind=c_int)        :: offload_get_device_number
-!dec$ attributes offload:mic :: offload_get_device_number
-end function offload_get_device_number
-
-function offload_get_physical_device_number()                          &
-           bind (C, name = "_Offload_get_physical_device_number")
-!dec$ attributes default :: offload_get_physical_device_number
-  import :: c_int
-  integer (kind=c_int)        :: offload_get_physical_device_number
-!dec$ attributes offload:mic :: offload_get_physical_device_number
-end function offload_get_physical_device_number
-
-! OpenMP API wrappers
-
-subroutine omp_set_num_threads_target (target_type,                    &
-                                       target_number,                  &
-                                       num_threads)                    &
-           bind (C, name = "omp_set_num_threads_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, num_threads
-end subroutine omp_set_num_threads_target
-
-function omp_get_max_threads_target (target_type,                      &
-                                     target_number)                    &
-         bind (C, name = "omp_get_max_threads_target")
-  import :: c_int
-  integer (kind=c_int)        :: omp_get_max_threads_target
-  integer (kind=c_int), value :: target_type, target_number
-end function omp_get_max_threads_target
-
-function omp_get_num_procs_target (target_type,                        &
-                                   target_number)                      &
-         bind (C, name = "omp_get_num_procs_target")
-  import :: c_int
-  integer (kind=c_int)        :: omp_get_num_procs_target
-  integer (kind=c_int), value :: target_type, target_number
-end function omp_get_num_procs_target
-
-subroutine omp_set_dynamic_target (target_type,                        &
-                                   target_number,                      &
-                                   num_threads)                        &
-           bind (C, name = "omp_set_dynamic_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, num_threads
-end subroutine omp_set_dynamic_target
-
-function omp_get_dynamic_target (target_type,                          &
-                                 target_number)                        &
-         bind (C, name = "omp_get_dynamic_target")
-  import :: c_int
-  integer (kind=c_int)        :: omp_get_dynamic_target
-  integer (kind=c_int), value :: target_type, target_number
-end function omp_get_dynamic_target
-
-subroutine omp_set_nested_target (target_type,                         &
-                                  target_number,                       &
-                                  nested)                              &
-           bind (C, name = "omp_set_nested_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, nested
-end subroutine omp_set_nested_target
-
-function omp_get_nested_target (target_type,                           &
-                                target_number)                         &
-         bind (C, name = "omp_get_nested_target")
-  import :: c_int
-  integer (kind=c_int)        :: omp_get_nested_target
-  integer (kind=c_int), value :: target_type, target_number
-end function omp_get_nested_target
-
-subroutine omp_set_schedule_target (target_type,                       &
-                                    target_number,                     &
-                                    kind,                              &
-                                    modifier)                          &
-           bind (C, name = "omp_set_schedule_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, kind, modifier
-end subroutine omp_set_schedule_target
-
-subroutine omp_get_schedule_target (target_type,                       &
-                                    target_number,                     &
-                                    kind,                              &
-                                    modifier)                          &
-           bind (C, name = "omp_get_schedule_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: kind, modifier
-end subroutine omp_get_schedule_target
-
-! lock API functions
-
-subroutine omp_init_lock_target (target_type,                          &
-                                 target_number,                        &
-                                 lock)                                 &
-           bind (C, name = "omp_init_lock_target")
-  import :: c_int, c_intptr_t
-  !dir$ attributes known_intrinsic ::  omp_init_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_init_lock_target
-
-subroutine omp_destroy_lock_target (target_type,                       &
-                                    target_number,                     &
-                                    lock)                              &
-           bind (C, name = "omp_destroy_lock_target")
-  import :: c_int, c_intptr_t
-  !dir$ attributes known_intrinsic ::  omp_destroy_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_destroy_lock_target
-
-subroutine omp_set_lock_target (target_type,                           &
-                                target_number,                         &
-                                lock)                                  &
-           bind (C, name = "omp_set_lock_target")
-  import :: c_int, c_intptr_t
-  !dir$ attributes known_intrinsic ::  omp_set_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_set_lock_target
-
-subroutine omp_unset_lock_target (target_type,                         &
-                                  target_number,                       &
-                                  lock)                                &
-           bind (C, name = "omp_unset_lock_target")
-  import :: c_int, c_intptr_t
-  !dir$ attributes known_intrinsic ::  omp_unset_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_unset_lock_target
-
-function omp_test_lock_target (target_type,                            &
-                               target_number,                          &
-                               lock)                                   &
-           bind (C, name = "omp_test_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: omp_test_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end function omp_test_lock_target
-
-! nested lock API functions
-
-subroutine omp_init_nest_lock_target (target_type,                     &
-                                      target_number,                   &
-                                      lock)                            &
-           bind (C, name = "omp_init_nest_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_init_nest_lock_target
-
-subroutine omp_destroy_nest_lock_target (target_type,                  &
-                                         target_number,                &
-                                         lock)                         &
-           bind (C, name = "omp_destroy_nest_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_destroy_nest_lock_target
-
-subroutine omp_set_nest_lock_target (target_type,                      &
-                                     target_number,                    &
-                                     lock)                             &
-           bind (C, name = "omp_set_nest_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_set_nest_lock_target
-
-subroutine omp_unset_nest_lock_target (target_type,                    &
-                                       target_number,                  &
-                                       lock)                           &
-           bind (C, name = "omp_unset_nest_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end subroutine omp_unset_nest_lock_target
-
-function omp_test_nest_lock_target (target_type,                       &
-                                    target_number,                     &
-                                    lock)                              &
-           bind (C, name = "omp_test_nest_lock_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: omp_test_nest_lock_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: lock
-end function omp_test_nest_lock_target
-
-! kmp API functions
-
-subroutine kmp_set_stacksize_target (target_type,                      &
-                                     target_number,                    &
-                                     size)                             &
-           bind (C, name = "kmp_set_stacksize_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, size
-end subroutine kmp_set_stacksize_target
-
-function kmp_get_stacksize_target (target_type,                        &
-                                   target_number)                      &
-         bind (C, name = "kmp_get_stacksize_target")
-  import :: c_int
-  integer (kind=c_int)        :: kmp_get_stacksize_target
-  integer (kind=c_int), value :: target_type, target_number
-end function kmp_get_stacksize_target
-
-subroutine kmp_set_stacksize_s_target (target_type,                    &
-                                       target_number,                  &
-                                       size)                           &
-           bind (C, name = "kmp_set_stacksize_s_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, size
-end subroutine kmp_set_stacksize_s_target
-
-function kmp_get_stacksize_s_target (target_type,                      &
-                                     target_number)                    &
-         bind (C, name = "kmp_get_stacksize_s_target")
-  import :: c_int
-  integer (kind=c_int)        :: kmp_get_stacksize_s_target
-  integer (kind=c_int), value :: target_type, target_number
-end function kmp_get_stacksize_s_target
-
-subroutine kmp_set_blocktime_target (target_type,                      &
-                                     target_number,                    &
-                                     time)                             &
-           bind (C, name = "kmp_set_blocktime_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, time
-end subroutine kmp_set_blocktime_target
-
-function kmp_get_blocktime_target (target_type,                        &
-                                   target_number)                      &
-         bind (C, name = "kmp_get_blocktime_target")
-  import :: c_int
-  integer (kind=c_int)        :: kmp_get_blocktime_target
-  integer (kind=c_int), value :: target_type, target_number
-end function kmp_get_blocktime_target
-
-subroutine kmp_set_library_serial_target (target_type,                 &
-                                          target_number)               &
-           bind (C, name = "kmp_set_library_serial_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number
-end subroutine kmp_set_library_serial_target
-
-subroutine kmp_set_library_turnaround_target (target_type,             &
-                                              target_number)           &
-           bind (C, name = "kmp_set_library_turnaround_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number
-end subroutine kmp_set_library_turnaround_target
-
-subroutine kmp_set_library_throughput_target (target_type,             &
-                                              target_number)           &
-           bind (C, name = "kmp_set_library_throughput_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number
-end subroutine kmp_set_library_throughput_target
-
-subroutine kmp_set_library_target (target_type,                        &
-                                   target_number,                      &
-                                   mode)                               &
-           bind (C, name = "kmp_set_library_target")
-  import :: c_int
-  integer (kind=c_int), value :: target_type, target_number, mode
-end subroutine kmp_set_library_target
-
-function kmp_get_library_target (target_type,                          &
-                                 target_number)                        &
-         bind (C, name = "kmp_get_library_target")
-  import :: c_int
-  integer (kind=c_int)        :: kmp_get_library_target
-  integer (kind=c_int), value :: target_type, target_number
-end function kmp_get_library_target
-
-subroutine kmp_set_defaults_target (target_type,                       &
-                                    target_number,                     &
-                                    defaults)                          &
-           bind (C, name = "kmp_set_defaults_target")
-  import :: c_int, c_char
- character (kind=c_char) :: defaults(*)
- integer (kind=c_int), value :: target_type, target_number
-end subroutine kmp_set_defaults_target
-
-! affinity API functions
-
-subroutine kmp_create_affinity_mask_target (target_type,               &
-                                            target_number,             &
-                                            mask)                      &
-           bind (C, name = "kmp_create_affinity_mask_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: mask
-end subroutine kmp_create_affinity_mask_target
-
-subroutine kmp_destroy_affinity_mask_target (target_type,              &
-                                             target_number,            &
-                                             mask)                     &
-           bind (C, name = "kmp_destroy_affinity_mask_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: mask
-end subroutine kmp_destroy_affinity_mask_target
-
-function kmp_set_affinity_target (target_type,                         &
-                                  target_number,                       &
-                                  mask)                                &
-           bind (C, name = "kmp_set_affinity_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: kmp_set_affinity_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: mask
-end function kmp_set_affinity_target
-
-function kmp_get_affinity_target (target_type,                         &
-                                  target_number,                       &
-                                  mask)                                &
-           bind (C, name = "kmp_get_affinity_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: kmp_get_affinity_target
-  integer (kind=c_int), value :: target_type, target_number
-  integer (kind=c_intptr_t), value :: mask
-end function kmp_get_affinity_target
-
-function kmp_get_affinity_max_proc_target (target_type,                &
-                                           target_number)              &
-           bind (C, name = "kmp_get_affinity_max_proc_target")
-  import :: c_int
-  integer (kind=c_int)        :: kmp_get_affinity_max_proc_target
-  integer (kind=c_int), value :: target_type, target_number
-end function kmp_get_affinity_max_proc_target
-
-function kmp_set_affinity_mask_proc_target (target_type,               &
-                                            target_number,             &
-                                            proc,                      &
-                                            mask)                      &
-           bind (C, name = "kmp_set_affinity_mask_proc_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: kmp_set_affinity_mask_proc_target
-  integer (kind=c_int), value :: target_type, target_number, proc
-  integer (kind=c_intptr_t), value :: mask
-end function kmp_set_affinity_mask_proc_target
-
-function kmp_unset_affinity_mask_proc_target (target_type,             &
-                                              target_number,           &
-                                              proc,                    &
-                                              mask)                    &
-           bind (C, name = "kmp_unset_affinity_mask_proc_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: kmp_unset_affinity_mask_proc_target
-  integer (kind=c_int), value :: target_type, target_number, proc
-  integer (kind=c_intptr_t), value :: mask
-end function kmp_unset_affinity_mask_proc_target
-
-function kmp_get_affinity_mask_proc_target (target_type,               &
-                                            target_number,             &
-                                            proc,                      &
-                                            mask)                      &
-           bind (C, name = "kmp_get_affinity_mask_proc_target")
-  import :: c_int, c_intptr_t
-  integer (kind=c_int)        :: kmp_get_affinity_mask_proc_target
-  integer (kind=c_int), value :: target_type, target_number, proc
-  integer (kind=c_intptr_t), value :: mask
-end function kmp_get_affinity_mask_proc_target
-
-end interface
-end module mic_lib
diff --git a/offload/src/offload.h b/offload/src/offload.h
deleted file mode 100644
index 68914b7..0000000
--- a/offload/src/offload.h
+++ /dev/null
@@ -1,474 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*
- * Include file for Offload API.
- */
-
-#ifndef OFFLOAD_H_INCLUDED
-#define OFFLOAD_H_INCLUDED
-
-#if defined(LINUX) || defined(FREEBSD)
-#include <bits/functexcept.h>
-#endif
-
-#include <stddef.h>
-#include <omp.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define TARGET_ATTRIBUTE __declspec(target(mic))
-
-/*
- *  The target architecture.
- */
-typedef enum TARGET_TYPE {
-    TARGET_NONE,    /* Undefine target */
-    TARGET_HOST,    /* Host used as target */
-    TARGET_MIC      /* MIC target */
-} TARGET_TYPE;
-
-/*
- *  The default target type.
- */
-#define DEFAULT_TARGET_TYPE TARGET_MIC
-
-/*
- *  The default target number.
- */
-#define DEFAULT_TARGET_NUMBER 0
-
-/*
- *  Offload status.
- */
-typedef enum {
-    OFFLOAD_SUCCESS = 0,
-    OFFLOAD_DISABLED,               /* offload is disabled */
-    OFFLOAD_UNAVAILABLE,            /* card is not available */
-    OFFLOAD_OUT_OF_MEMORY,          /* not enough memory on device */
-    OFFLOAD_PROCESS_DIED,           /* target process has died */
-    OFFLOAD_ERROR                   /* unspecified error */
-} _Offload_result;
-
-typedef struct {
-    _Offload_result result;         /* result, see above */
-    int             device_number;  /* device number */
-    size_t          data_sent;      /* number of bytes sent to the target */
-    size_t          data_received;  /* number of bytes received by host */
-} _Offload_status;
-
-#define OFFLOAD_STATUS_INIT(x) \
-    ((x).result = OFFLOAD_DISABLED)
-
-#define OFFLOAD_STATUS_INITIALIZER \
-    { OFFLOAD_DISABLED, -1, 0, 0 }
-
-/* Offload runtime interfaces */
-
-extern int _Offload_number_of_devices(void);
-extern int _Offload_get_device_number(void);
-extern int _Offload_get_physical_device_number(void);
-
-extern void* _Offload_shared_malloc(size_t size);
-extern void  _Offload_shared_free(void *ptr);
-
-extern void* _Offload_shared_aligned_malloc(size_t size, size_t align);
-extern void  _Offload_shared_aligned_free(void *ptr);
-
-extern int _Offload_signaled(int index, void *signal);
-extern void _Offload_report(int val);
-
-/* OpenMP API */
-
-extern void omp_set_default_device(int num);
-extern int  omp_get_default_device(void);
-extern int  omp_get_num_devices(void);
-
-/* OpenMP API wrappers */
-
-/* Set num_threads on target */
-extern void omp_set_num_threads_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-);
-
-/* Get max_threads from target */
-extern int omp_get_max_threads_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-/* Get num_procs from target */
-extern int omp_get_num_procs_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-/* Set dynamic on target */
-extern void omp_set_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-);
-
-/* Get dynamic from target */
-extern int omp_get_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-/* Set nested on target */
-extern void omp_set_nested_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int nested
-);
-
-/* Get nested from target */
-extern int omp_get_nested_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void omp_set_num_threads_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-);
-
-extern int omp_get_max_threads_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern int omp_get_num_procs_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void omp_set_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-);
-
-extern int omp_get_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void omp_set_nested_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-);
-
-extern int omp_get_nested_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void omp_set_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t kind,
-    int modifier
-);
-
-extern void omp_get_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t *kind,
-    int *modifier
-);
-
-/* lock API functions */
-
-typedef struct {
-    omp_lock_t lock;
-} omp_lock_target_t;
-
-extern void omp_init_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-);
-
-extern void omp_destroy_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-);
-
-extern void omp_set_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-);
-
-extern void omp_unset_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-);
-
-extern int omp_test_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-);
-
-/* nested lock API functions */
-
-typedef struct {
-    omp_nest_lock_t lock;
-} omp_nest_lock_target_t;
-
-extern void omp_init_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-);
-
-extern void omp_destroy_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-);
-
-extern void omp_set_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-);
-
-extern void omp_unset_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-);
-
-extern int omp_test_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-);
-
-/* kmp API functions */
-
-extern void kmp_set_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int size
-);
-
-extern int kmp_get_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    size_t size
-);
-
-extern size_t kmp_get_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int time
-);
-
-extern int kmp_get_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_library_serial_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_library_turnaround_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_library_throughput_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_library_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int mode
-);
-
-extern int kmp_get_library_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern void kmp_set_defaults_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    char const *defaults
-);
-
-/* affinity API functions */
-
-typedef struct {
-    kmp_affinity_mask_t mask;
-} kmp_affinity_mask_target_t;
-
-extern void kmp_create_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern void kmp_destroy_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern int kmp_set_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern int kmp_get_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern int kmp_get_affinity_max_proc_target(
-    TARGET_TYPE target_type,
-    int target_number
-);
-
-extern int kmp_set_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern int kmp_unset_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-);
-
-extern int kmp_get_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-);
-
-#ifdef __cplusplus
-} /* extern "C" */
-
-/* Namespace for the shared_allocator. */
-namespace __offload {
-  /* This follows the specification for std::allocator. */
-  /* Forward declaration of the class template. */
-  template <typename T>
-  class shared_allocator;
-
-  /* Specialization for shared_allocator<void>. */
-  template <>
-  class shared_allocator<void> {
-  public:
-    typedef void       *pointer;
-    typedef const void *const_pointer;
-    typedef void        value_type;
-    template <class U> struct rebind { typedef shared_allocator<U> other; };
-  };
-
-  /* Definition of shared_allocator<T>. */
-  template <class T>
-  class shared_allocator {
-  public:
-    typedef size_t     size_type;
-    typedef ptrdiff_t  difference_type;
-    typedef T         *pointer;
-    typedef const T   *const_pointer;
-    typedef T         &reference;
-    typedef const T   &const_reference;
-    typedef T          value_type;
-    template <class U> struct rebind { typedef shared_allocator<U> other; };
-    shared_allocator() throw() { }
-    shared_allocator(const shared_allocator&) throw() { }
-    template <class U> shared_allocator(const shared_allocator<U>&) throw() { }
-    ~shared_allocator() throw() { }
-    pointer address(reference x) const { return &x; }
-    const_pointer address(const_reference x) const { return &x; }
-    pointer allocate(
-      size_type, shared_allocator<void>::const_pointer hint = 0);
-    void deallocate(pointer p, size_type n);
-    size_type max_size() const throw() {
-      return size_type(-1)/sizeof(T);
-    } /* max_size */
-    void construct(pointer p, const T& arg) {
-      ::new (p) T(arg);
-    } /* construct */
-    void destroy(pointer p) {
-      p->~T();
-    } /* destroy */
-  };
-
-  /* Definition for allocate. */
-  template <class T>
-  typename shared_allocator<T>::pointer
-  shared_allocator<T>::allocate(shared_allocator<T>::size_type s,
-                                shared_allocator<void>::const_pointer) {
-    /* Allocate from shared memory. */
-    void *ptr = _Offload_shared_malloc(s*sizeof(T));
-    if (ptr == 0) std::__throw_bad_alloc();
-    return static_cast<pointer>(ptr);
-  } /* allocate */
-
-  template <class T>
-  void shared_allocator<T>::deallocate(pointer p,
-                                       shared_allocator<T>::size_type) {
-    /* Free the shared memory. */
-    _Offload_shared_free(p);
-  } /* deallocate */
-
-  template <typename _T1, typename _T2>
-  inline bool operator==(const shared_allocator<_T1> &, 
-                         const shared_allocator<_T2> &) throw() {
-    return true;
-  }  /* operator== */
-
-  template <typename _T1, typename _T2>
-  inline bool operator!=(const shared_allocator<_T1> &, 
-                         const shared_allocator<_T2> &) throw() {
-    return false;
-  }  /* operator!= */
-}  /* __offload */
-#endif /* __cplusplus */
-
-#endif /* OFFLOAD_H_INCLUDED */
diff --git a/offload/src/offload_common.cpp b/offload/src/offload_common.cpp
deleted file mode 100644
index 3681b06..0000000
--- a/offload/src/offload_common.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#if defined(LINUX) || defined(FREEBSD)
-#include <mm_malloc.h>
-#endif
-
-#include "offload_common.h"
-
-// The debug routines
-
-#if OFFLOAD_DEBUG > 0
-
-void __dump_bytes(
-    int trace_level,
-    const void *data,
-    int len
-)
-{
-    if (console_enabled > trace_level) {
-        const uint8_t *arr = (const uint8_t*) data;
-        char buffer[4096];
-        char *bufferp;
-        int count = 0;
-
-        bufferp = buffer;
-        while (len--) {
-            sprintf(bufferp, "%02x", *arr++);
-            bufferp += 2;
-            count++;
-            if ((count&3) == 0) {
-                sprintf(bufferp, " ");
-                bufferp++;
-            }
-            if ((count&63) == 0) {
-                OFFLOAD_DEBUG_TRACE(trace_level, "%s\n", buffer);
-                bufferp = buffer;
-                count = 0;
-            }
-        }
-        if (count) {
-            OFFLOAD_DEBUG_TRACE(trace_level, "%s\n", buffer);
-        }
-    }
-}
-#endif // OFFLOAD_DEBUG
-
-// The Marshaller and associated routines
-
-void Marshaller::send_data(
-    const void *data,
-    int64_t length
-)
-{
-    OFFLOAD_DEBUG_TRACE(2, "send_data(%p, %lld)\n",
-                        data, length);
-    memcpy(buffer_ptr, data, (size_t)length);
-    buffer_ptr += length;
-    tfr_size += length;
-}
-
-void Marshaller::receive_data(
-    void *data,
-    int64_t length
-)
-{
-    OFFLOAD_DEBUG_TRACE(2, "receive_data(%p, %lld)\n",
-                        data, length);
-    memcpy(data, buffer_ptr, (size_t)length);
-    buffer_ptr += length;
-    tfr_size += length;
-}
-
-// Send function pointer
-void Marshaller::send_func_ptr(
-    const void* data
-)
-{
-    const char* name;
-    size_t      length;
-
-    if (data != 0) {
-        name = __offload_funcs.find_name(data);
-        if (name == 0) {
-#if OFFLOAD_DEBUG > 0
-            if (console_enabled > 2) {
-                __offload_funcs.dump();
-            }
-#endif // OFFLOAD_DEBUG > 0
-
-            LIBOFFLOAD_ERROR(c_send_func_ptr, data);
-            exit(1);
-        }
-        length = strlen(name) + 1;
-    }
-    else {
-        name = "";
-        length = 1;
-    }
-
-    memcpy(buffer_ptr, name, length);
-    buffer_ptr += length;
-    tfr_size += length;
-}
-
-// Receive function pointer
-void Marshaller::receive_func_ptr(
-    const void** data
-)
-{
-    const char* name;
-    size_t      length;
-
-    name = (const char*) buffer_ptr;
-    if (name[0] != '\0') {
-        *data = __offload_funcs.find_addr(name);
-        if (*data == 0) {
-#if OFFLOAD_DEBUG > 0
-            if (console_enabled > 2) {
-                __offload_funcs.dump();
-            }
-#endif // OFFLOAD_DEBUG > 0
-
-            LIBOFFLOAD_ERROR(c_receive_func_ptr, name);
-            exit(1);
-        }
-        length = strlen(name) + 1;
-    }
-    else {
-        *data = 0;
-        length = 1;
-    }
-
-    buffer_ptr += length;
-    tfr_size += length;
-}
-
-// End of the Marshaller and associated routines
-
-extern void *OFFLOAD_MALLOC(
-    size_t size,
-    size_t align
-)
-{
-    void *ptr;
-    int   err;
-
-    OFFLOAD_DEBUG_TRACE(2, "%s(%lld, %lld)\n", __func__, size, align);
-
-    if (align < sizeof(void*)) {
-        align = sizeof(void*);
-    }
-
-    ptr = _mm_malloc(size, align);
-    if (ptr == NULL) {
-        LIBOFFLOAD_ERROR(c_offload_malloc, size, align);
-        exit(1);
-    }
-
-    OFFLOAD_DEBUG_TRACE(2, "%s returned %p\n", __func__, ptr);
-
-    return ptr;
-}
diff --git a/offload/src/offload_common.h b/offload/src/offload_common.h
deleted file mode 100644
index 11cb8bb..0000000
--- a/offload/src/offload_common.h
+++ /dev/null
@@ -1,444 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*! \file
-    \brief The parts of the runtime library common to host and target
-*/
-
-#ifndef OFFLOAD_COMMON_H_INCLUDED
-#define OFFLOAD_COMMON_H_INCLUDED
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <memory.h>
-
-#include "offload.h"
-#include "offload_table.h"
-#include "offload_trace.h"
-#include "offload_timer.h"
-#include "offload_util.h"
-#include "cean_util.h"
-#include "dv_util.h"
-#include "liboffload_error_codes.h"
-
-#include <stdarg.h>
-
-// The debug routines
-
-// Host console and file logging
-extern int console_enabled;
-extern int offload_report_level;
-
-#define OFFLOAD_DO_TRACE (offload_report_level == 3)
-
-extern const char *prefix;
-extern int offload_number;
-#if !HOST_LIBRARY
-extern int mic_index;
-#endif
-
-#if HOST_LIBRARY
-void Offload_Report_Prolog(OffloadHostTimerData* timer_data);
-void Offload_Report_Epilog(OffloadHostTimerData* timer_data);
-void offload_report_free_data(OffloadHostTimerData * timer_data);
-void Offload_Timer_Print(void);
-
-#ifndef TARGET_WINNT
-#define OFFLOAD_DEBUG_INCR_OFLD_NUM() \
-        __sync_add_and_fetch(&offload_number, 1)
-#else
-#define OFFLOAD_DEBUG_INCR_OFLD_NUM() \
-        _InterlockedIncrement(reinterpret_cast<long*>(&offload_number))
-#endif
-
-#define OFFLOAD_DEBUG_PRINT_TAG_PREFIX() \
-        printf("%s:  ", prefix);
-
-#define OFFLOAD_DEBUG_PRINT_PREFIX() \
-        printf("%s:  ", prefix);
-#else
-#define OFFLOAD_DEBUG_PRINT_PREFIX() \
-        printf("%s%d:  ", prefix, mic_index);
-#endif // HOST_LIBRARY
-
-#define OFFLOAD_TRACE(trace_level, ...)  \
-    if (console_enabled >= trace_level) { \
-        OFFLOAD_DEBUG_PRINT_PREFIX(); \
-        printf(__VA_ARGS__); \
-        fflush(NULL); \
-    }
-
-#if OFFLOAD_DEBUG > 0
-
-#define OFFLOAD_DEBUG_TRACE(level, ...) \
-    OFFLOAD_TRACE(level, __VA_ARGS__)
-
-#define OFFLOAD_REPORT(level, offload_number, stage, ...) \
-    if (OFFLOAD_DO_TRACE) { \
-        offload_stage_print(stage, offload_number, __VA_ARGS__); \
-        fflush(NULL); \
-    }
-
-#define OFFLOAD_DEBUG_TRACE_1(level, offload_number, stage, ...) \
-    if (OFFLOAD_DO_TRACE) { \
-        offload_stage_print(stage, offload_number, __VA_ARGS__); \
-        fflush(NULL); \
-    } \
-    if (!OFFLOAD_DO_TRACE) { \
-        OFFLOAD_TRACE(level, __VA_ARGS__) \
-    }
-
-#define OFFLOAD_DEBUG_DUMP_BYTES(level, a, b) \
-    __dump_bytes(level, a, b)
-
-extern void __dump_bytes(
-    int level,
-    const void *data,
-    int len
-);
-
-#else
-
-#define OFFLOAD_DEBUG_LOG(level, ...)
-#define OFFLOAD_DEBUG_DUMP_BYTES(level, a, b)
-
-#endif
-
-// Runtime interface
-
-#define OFFLOAD_PREFIX(a) __offload_##a
-
-#define OFFLOAD_MALLOC            OFFLOAD_PREFIX(malloc)
-#define OFFLOAD_FREE(a)           _mm_free(a)
-
-// Forward functions
-
-extern void *OFFLOAD_MALLOC(size_t size, size_t align);
-
-// The Marshaller
-
-//! \enum Indicator for the type of entry on an offload item list.
-enum OffloadItemType {
-    c_data =   1,       //!< Plain data
-    c_data_ptr,         //!< Pointer data
-    c_func_ptr,         //!< Function pointer
-    c_void_ptr,         //!< void*
-    c_string_ptr,       //!< C string
-    c_dv,               //!< Dope vector variable
-    c_dv_data,          //!< Dope-vector data
-    c_dv_data_slice,    //!< Dope-vector data's slice
-    c_dv_ptr,           //!< Dope-vector variable pointer
-    c_dv_ptr_data,      //!< Dope-vector pointer data
-    c_dv_ptr_data_slice,//!< Dope-vector pointer data's slice
-    c_cean_var,         //!< CEAN variable
-    c_cean_var_ptr,     //!< Pointer to CEAN variable
-    c_data_ptr_array,   //!< Pointer to data pointer array
-    c_func_ptr_array,   //!< Pointer to function pointer array
-    c_void_ptr_array,   //!< Pointer to void* pointer array
-    c_string_ptr_array  //!< Pointer to char* pointer array
-};
-
-#define VAR_TYPE_IS_PTR(t) ((t) == c_string_ptr || \
-                            (t) == c_data_ptr || \
-                            (t) == c_cean_var_ptr || \
-                            (t) == c_dv_ptr)
-
-#define VAR_TYPE_IS_SCALAR(t) ((t) == c_data || \
-                               (t) == c_void_ptr || \
-                               (t) == c_cean_var || \
-                               (t) == c_dv)
-
-#define VAR_TYPE_IS_DV_DATA(t) ((t) == c_dv_data || \
-                                (t) == c_dv_ptr_data)
-
-#define VAR_TYPE_IS_DV_DATA_SLICE(t) ((t) == c_dv_data_slice || \
-                                      (t) == c_dv_ptr_data_slice)
-
-
-//! \enum Specify direction to copy offloaded variable.
-enum OffloadParameterType {
-    c_parameter_unknown = -1, //!< Unknown clause
-    c_parameter_nocopy,       //!< Variable listed in "nocopy" clause
-    c_parameter_in,           //!< Variable listed in "in" clause
-    c_parameter_out,          //!< Variable listed in "out" clause
-    c_parameter_inout         //!< Variable listed in "inout" clause
-};
-
-//! An Offload Variable descriptor
-struct VarDesc {
-    //! OffloadItemTypes of source and destination
-    union {
-        struct {
-            uint8_t dst : 4; //!< OffloadItemType of destination
-            uint8_t src : 4; //!< OffloadItemType of source
-        };
-        uint8_t bits;
-    } type;
-
-    //! OffloadParameterType that describes direction of data transfer
-    union {
-        struct {
-            uint8_t in  : 1; //!< Set if IN or INOUT
-            uint8_t out : 1; //!< Set if OUT or INOUT
-        };
-        uint8_t bits;
-    } direction;
-
-    uint8_t alloc_if;        //!< alloc_if modifier value
-    uint8_t free_if;         //!< free_if modifier value
-    uint32_t align;          //!< MIC alignment requested for pointer data
-    //! Not used by compiler; set to 0
-    /*! Used by runtime as offset to data from start of MIC buffer */
-    uint32_t mic_offset;
-    //! Flags describing this variable
-    union {
-        struct {
-            //! source variable has persistent storage
-            uint32_t is_static : 1;
-            //! destination variable has persistent storage
-            uint32_t is_static_dstn : 1;
-            //! has length for c_dv && c_dv_ptr
-            uint32_t has_length : 1;
-            //! persisted local scalar is in stack buffer
-            uint32_t is_stack_buf : 1;
-            //! buffer address is sent in data
-            uint32_t sink_addr : 1;
-            //! alloc displacement is sent in data
-            uint32_t alloc_disp : 1;
-            //! source data is noncontiguous
-            uint32_t is_noncont_src : 1;
-            //! destination data is noncontiguous
-            uint32_t is_noncont_dst : 1;
-        };
-        uint32_t bits;
-    } flags;
-    //! Not used by compiler; set to 0
-    /*! Used by runtime as offset to base from data stored in a buffer */
-    int64_t offset;
-    //! Element byte-size of data to be transferred
-    /*! For dope-vector, the size of the dope-vector      */
-    int64_t size;
-    union {
-        //! Set to 0 for array expressions and dope-vectors
-        /*! Set to 1 for scalars                          */
-        /*! Set to value of length modifier for pointers  */
-        int64_t count;
-        //! Displacement not used by compiler
-        int64_t disp;
-    };
-
-    //! This field not used by OpenMP 4.0
-    /*! The alloc section expression in #pragma offload   */
-    union {
-       void *alloc;
-       int64_t ptr_arr_offset;
-    };
-
-    //! This field not used by OpenMP 4.0
-    /*! The into section expression in #pragma offload    */
-    /*! For c_data_ptr_array this is the into ptr array   */
-    void *into;
-
-    //! For an ordinary variable, address of the variable
-    /*! For c_cean_var (C/C++ array expression),
-        pointer to arr_desc, which is an array descriptor. */
-    /*! For c_data_ptr_array (array of data pointers),
-        pointer to ptr_array_descriptor,
-        which is a descriptor for pointer array transfers. */
-    void *ptr;
-};
-
-//! Auxiliary struct used when -g is enabled that holds variable names
-struct VarDesc2 {
-    const char *sname; //!< Source name
-    const char *dname; //!< Destination name (when "into" is used)
-};
-
-/*! When the OffloadItemType is c_data_ptr_array
-    the ptr field of the main descriptor points to this struct.          */
-/*! The type in VarDesc1 merely says c_cean_data_ptr, but the pointer
-    type can be c_data_ptr, c_func_ptr, c_void_ptr, or c_string_ptr.
-    Therefore the actual pointer type is in the flags field of VarDesc3. */
-/*! If flag_align_is_array/flag_alloc_if_is_array/flag_free_if_is_array
-    is 0 then alignment/alloc_if/free_if are specified in VarDesc1.      */
-/*! If flag_align_is_array/flag_alloc_if_is_array/flag_free_if_is_array
-    is 1 then align_array/alloc_if_array/free_if_array specify
-    the set of alignment/alloc_if/free_if values.                        */
-/*! For the other fields, if neither the scalar nor the array flag
-    is set, then that modifier was not specified. If the bits are set
-    they specify which modifier was set and whether it was a
-    scalar or an array expression.                                       */
-struct VarDesc3
-{
-    void *ptr_array;        //!< Pointer to arr_desc of array of pointers
-    void *align_array;      //!< Scalar value or pointer to arr_desc
-    void *alloc_if_array;   //!< Scalar value or pointer to arr_desc
-    void *free_if_array;    //!< Scalar value or pointer to arr_desc
-    void *extent_start;     //!< Scalar value or pointer to arr_desc
-    void *extent_elements;  //!< Scalar value or pointer to arr_desc
-    void *into_start;       //!< Scalar value or pointer to arr_desc
-    void *into_elements;    //!< Scalar value or pointer to arr_desc
-    void *alloc_start;      //!< Scalar value or pointer to arr_desc
-    void *alloc_elements;   //!< Scalar value or pointer to arr_desc
-    /*! Flags that describe the pointer type and whether each field
-        is a scalar value or an array expression.        */
-    /*! First 6 bits are pointer array element type:
-        c_data_ptr, c_func_ptr, c_void_ptr, c_string_ptr */
-    /*! Then single bits specify:                        */
-    /*!     align_array is an array                      */
-    /*!     alloc_if_array is an array                   */
-    /*!     free_if_array is an array                    */
-    /*!     extent_start is a scalar expression          */
-    /*!     extent_start is an array expression          */
-    /*!     extent_elements is a scalar expression       */
-    /*!     extent_elements is an array expression       */
-    /*!     into_start is a scalar expression            */
-    /*!     into_start is an array expression            */
-    /*!     into_elements is a scalar expression         */
-    /*!     into_elements is an array expression         */
-    /*!     alloc_start is a scalar expression           */
-    /*!     alloc_start is an array expression           */
-    /*!     alloc_elements is a scalar expression        */
-    /*!     alloc_elements is an array expression        */
-    uint32_t array_fields;
-};
-const int flag_align_is_array = 6;
-const int flag_alloc_if_is_array = 7;
-const int flag_free_if_is_array = 8;
-const int flag_extent_start_is_scalar = 9;
-const int flag_extent_start_is_array = 10;
-const int flag_extent_elements_is_scalar = 11;
-const int flag_extent_elements_is_array = 12;
-const int flag_into_start_is_scalar = 13;
-const int flag_into_start_is_array = 14;
-const int flag_into_elements_is_scalar = 15;
-const int flag_into_elements_is_array = 16;
-const int flag_alloc_start_is_scalar = 17;
-const int flag_alloc_start_is_array = 18;
-const int flag_alloc_elements_is_scalar = 19;
-const int flag_alloc_elements_is_array = 20;
-
-// The Marshaller
-class Marshaller
-{
-private:
-    // Start address of buffer
-    char *buffer_start;
-
-    // Current pointer within buffer
-    char *buffer_ptr;
-
-    // Physical size of data sent (including flags)
-    long long buffer_size;
-
-    // User data sent/received
-    long long tfr_size;
-
-public:
-    // Constructor
-    Marshaller() :
-        buffer_start(0), buffer_ptr(0),
-        buffer_size(0), tfr_size(0)
-    {
-    }
-
-    // Return count of user data sent/received
-    long long get_tfr_size() const
-    {
-        return tfr_size;
-    }
-
-    // Return pointer to buffer
-    char *get_buffer_start() const
-    {
-        return buffer_start;
-    }
-
-    // Return current size of data in buffer
-    long long get_buffer_size() const
-    {
-        return buffer_size;
-    }
-
-    // Set buffer pointer
-    void init_buffer(
-        char *d,
-        long long s
-    )
-    {
-        buffer_start = buffer_ptr = d;
-        buffer_size = s;
-    }
-
-    // Send data
-    void send_data(
-        const void *data,
-        int64_t length
-    );
-
-    // Receive data
-    void receive_data(
-        void *data,
-        int64_t length
-    );
-
-    // Send function pointer
-    void send_func_ptr(
-        const void* data
-    );
-
-    // Receive function pointer
-    void receive_func_ptr(
-        const void** data
-    );
-};
-
-// End of the Marshaller
-
-// The offloaded function descriptor.
-// Sent from host to target to specify which function to run.
-// Also, sets console and file tracing levels.
-struct FunctionDescriptor
-{
-    // Input data size.
-    long long in_datalen;
-
-    // Output data size.
-    long long out_datalen;
-
-    // Whether trace is requested on console.
-    // A value of 1 produces only function name and data sent/received.
-    // Values > 1 produce copious trace information.
-    uint8_t console_enabled;
-
-    // Flag controlling timing on the target side.
-    // Values > 0 enable timing on sink.
-    uint8_t timer_enabled;
-
-    int offload_report_level;
-    int offload_number;
-
-    // number of variable descriptors
-    int vars_num;
-
-    // inout data offset if data is passed as misc/return data
-    // otherwise it should be zero.
-    int data_offset;
-
-    // The name of the offloaded function
-    char data[];
-};
-
-// typedef OFFLOAD.
-// Pointer to OffloadDescriptor.
-typedef struct OffloadDescriptor *OFFLOAD;
-
-#endif // OFFLOAD_COMMON_H_INCLUDED
diff --git a/offload/src/offload_engine.cpp b/offload/src/offload_engine.cpp
deleted file mode 100644
index 069b604..0000000
--- a/offload/src/offload_engine.cpp
+++ /dev/null
@@ -1,531 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_engine.h"
-#include <signal.h>
-#include <errno.h>
-
-#include <algorithm>
-#include <vector>
-
-#include "offload_host.h"
-#include "offload_table.h"
-
-const char* Engine::m_func_names[Engine::c_funcs_total] =
-{
-    "server_compute",
-#ifdef MYO_SUPPORT
-    "server_myoinit",
-    "server_myofini",
-#endif // MYO_SUPPORT
-    "server_init",
-    "server_var_table_size",
-    "server_var_table_copy"
-};
-
-// Symbolic representation of system signals. Fix for CQ233593
-const char* Engine::c_signal_names[Engine::c_signal_max] =
-{
-    "Unknown SIGNAL",
-    "SIGHUP",    /*  1, Hangup (POSIX).  */
-    "SIGINT",    /*  2, Interrupt (ANSI).  */
-    "SIGQUIT",   /*  3, Quit (POSIX).  */
-    "SIGILL",    /*  4, Illegal instruction (ANSI).  */
-    "SIGTRAP",   /*  5, Trace trap (POSIX).  */
-    "SIGABRT",   /*  6, Abort (ANSI).  */
-    "SIGBUS",    /*  7, BUS error (4.2 BSD).  */
-    "SIGFPE",    /*  8, Floating-point exception (ANSI).  */
-    "SIGKILL",   /*  9, Kill, unblockable (POSIX).  */
-    "SIGUSR1",   /* 10, User-defined signal 1 (POSIX).  */
-    "SIGSEGV",   /* 11, Segmentation violation (ANSI).  */
-    "SIGUSR2",   /* 12, User-defined signal 2 (POSIX).  */
-    "SIGPIPE",   /* 13, Broken pipe (POSIX).  */
-    "SIGALRM",   /* 14, Alarm clock (POSIX).  */
-    "SIGTERM",   /* 15, Termination (ANSI).  */
-    "SIGSTKFLT", /* 16, Stack fault.  */
-    "SIGCHLD",   /* 17, Child status has changed (POSIX).  */
-    "SIGCONT",   /* 18, Continue (POSIX).  */
-    "SIGSTOP",   /* 19, Stop, unblockable (POSIX).  */
-    "SIGTSTP",   /* 20, Keyboard stop (POSIX).  */
-    "SIGTTIN",   /* 21, Background read from tty (POSIX).  */
-    "SIGTTOU",   /* 22, Background write to tty (POSIX).  */
-    "SIGURG",    /* 23, Urgent condition on socket (4.2 BSD).  */
-    "SIGXCPU",   /* 24, CPU limit exceeded (4.2 BSD).  */
-    "SIGXFSZ",   /* 25, File size limit exceeded (4.2 BSD).  */
-    "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD).  */
-    "SIGPROF",   /* 27, Profiling alarm clock (4.2 BSD).  */
-    "SIGWINCH",  /* 28, Window size change (4.3 BSD, Sun).  */
-    "SIGIO",     /* 29, I/O now possible (4.2 BSD).  */
-    "SIGPWR",    /* 30, Power failure restart (System V).  */
-    "SIGSYS"     /* 31, Bad system call.  */
-};
-
-void Engine::init(void)
-{
-    if (!m_ready) {
-        mutex_locker_t locker(m_lock);
-
-        if (!m_ready) {
-            // start process if not done yet
-            if (m_process == 0) {
-                init_process();
-            }
-
-            // load penging images
-            load_libraries();
-
-            // and (re)build pointer table
-            init_ptr_data();
-
-            // it is ready now
-            m_ready = true;
-        }
-    }
-}
-
-void Engine::init_process(void)
-{
-    COIENGINE engine;
-    COIRESULT res;
-    const char **environ;
-
-    // create environment for the target process
-    environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
-    if (environ != 0) {
-        for (const char **p = environ; *p != 0; p++) {
-            OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
-        }
-    }
-
-    // Create execution context in the specified device
-    OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
-                        m_physical_index);
-    res = COI::EngineGetHandle(COI_ISA_KNC, m_physical_index, &engine);
-    check_result(res, c_get_engine_handle, m_index, res);
-
-    // Target executable should be available by the time when we
-    // attempt to initialize the device
-    if (__target_exe == 0) {
-        LIBOFFLOAD_ERROR(c_no_target_exe);
-        exit(1);
-    }
-
-    OFFLOAD_DEBUG_TRACE(2,
-        "Loading target executable \"%s\" from %p, size %lld\n",
-        __target_exe->name, __target_exe->data, __target_exe->size);
-
-    res = COI::ProcessCreateFromMemory(
-        engine,                 // in_Engine
-        __target_exe->name,     // in_pBinaryName
-        __target_exe->data,     // in_pBinaryBuffer
-        __target_exe->size,     // in_BinaryBufferLength,
-        0,                      // in_Argc
-        0,                      // in_ppArgv
-        environ == 0,           // in_DupEnv
-        environ,                // in_ppAdditionalEnv
-        mic_proxy_io,           // in_ProxyActive
-        mic_proxy_fs_root,      // in_ProxyfsRoot
-        mic_buffer_size,        // in_BufferSpace
-        mic_library_path,       // in_LibrarySearchPath
-        __target_exe->origin,   // in_FileOfOrigin
-        __target_exe->offset,   // in_FileOfOriginOffset
-        &m_process              // out_pProcess
-    );
-    check_result(res, c_process_create, m_index, res);
-
-    // get function handles
-    res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
-                                         m_func_names, m_funcs);
-    check_result(res, c_process_get_func_handles, m_index, res);
-
-    // initialize device side
-    pid_t pid = init_device();
-
-    // For IDB
-    if (__dbg_is_attached) {
-        // TODO: we have in-memory executable now.
-        // Check with IDB team what should we provide them now?
-        if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
-            strcpy(__dbg_target_exe_name, __target_exe->name);
-        }
-        __dbg_target_so_pid = pid;
-        __dbg_target_id = m_physical_index;
-        __dbg_target_so_loaded();
-    }
-}
-
-void Engine::fini_process(bool verbose)
-{
-    if (m_process != 0) {
-        uint32_t sig;
-        int8_t ret;
-
-        // destroy target process
-        OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
-                            m_index);
-
-        COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
-        m_process = 0;
-
-        if (res == COI_SUCCESS) {
-            OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
-                                sig, ret);
-            if (verbose) {
-                if (sig != 0) {
-                    LIBOFFLOAD_ERROR(
-                        c_mic_process_exit_sig, m_index, sig,
-                        c_signal_names[sig >= c_signal_max ? 0 : sig]);
-                }
-                else {
-                    LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
-                }
-            }
-
-            // for idb
-            if (__dbg_is_attached) {
-                __dbg_target_so_unloaded();
-            }
-        }
-        else {
-            if (verbose) {
-                LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
-            }
-        }
-    }
-}
-
-void Engine::load_libraries()
-{
-    // load libraries collected so far
-    for (TargetImageList::iterator it = m_images.begin();
-         it != m_images.end(); it++) {
-        OFFLOAD_DEBUG_TRACE(2, "Loading library \"%s\" from %p, size %llu\n",
-                            it->name, it->data, it->size);
-
-        // load library to the device
-        COILIBRARY lib;
-        COIRESULT res;
-        res = COI::ProcessLoadLibraryFromMemory(m_process,
-                                                it->data,
-                                                it->size,
-                                                it->name,
-                                                mic_library_path,
-                                                it->origin,
-                                                it->offset,
-                                                COI_LOADLIBRARY_V1_FLAGS,
-                                                &lib);
-
-        if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
-            check_result(res, c_load_library, m_index, res);
-        }
-    }
-    m_images.clear();
-}
-
-static bool target_entry_cmp(
-    const VarList::BufEntry &l,
-    const VarList::BufEntry &r
-)
-{
-    const char *l_name = reinterpret_cast<const char*>(l.name);
-    const char *r_name = reinterpret_cast<const char*>(r.name);
-    return strcmp(l_name, r_name) < 0;
-}
-
-static bool host_entry_cmp(
-    const VarTable::Entry *l,
-    const VarTable::Entry *r
-)
-{
-    return strcmp(l->name, r->name) < 0;
-}
-
-void Engine::init_ptr_data(void)
-{
-    COIRESULT res;
-    COIEVENT event;
-
-    // Prepare table of host entries
-    std::vector<const VarTable::Entry*> host_table(__offload_vars.begin(),
-                                                   __offload_vars.end());
-
-    // no need to do anything further is host table is empty
-    if (host_table.size() <= 0) {
-        return;
-    }
-
-    // Get var table entries from the target.
-    // First we need to get size for the buffer to copy data
-    struct {
-        int64_t nelems;
-        int64_t length;
-    } params;
-
-    res = COI::PipelineRunFunction(get_pipeline(),
-                                   m_funcs[c_func_var_table_size],
-                                   0, 0, 0,
-                                   0, 0,
-                                   0, 0,
-                                   &params, sizeof(params),
-                                   &event);
-    check_result(res, c_pipeline_run_func, m_index, res);
-
-    res = COI::EventWait(1, &event, -1, 1, 0, 0);
-    check_result(res, c_event_wait, res);
-
-    if (params.length == 0) {
-        return;
-    }
-
-    // create buffer for target entries and copy data to host
-    COIBUFFER buffer;
-    res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
-                            &m_process, &buffer);
-    check_result(res, c_buf_create, m_index, res);
-
-    COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
-    res = COI::PipelineRunFunction(get_pipeline(),
-                                   m_funcs[c_func_var_table_copy],
-                                   1, &buffer, &flags,
-                                   0, 0,
-                                   &params.nelems, sizeof(params.nelems),
-                                   0, 0,
-                                   &event);
-    check_result(res, c_pipeline_run_func, m_index, res);
-
-    res = COI::EventWait(1, &event, -1, 1, 0, 0);
-    check_result(res, c_event_wait, res);
-
-    // patch names in target data
-    VarList::BufEntry *target_table;
-    COIMAPINSTANCE map_inst;
-    res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
-                         0, &map_inst,
-                         reinterpret_cast<void**>(&target_table));
-    check_result(res, c_buf_map, res);
-
-    VarList::table_patch_names(target_table, params.nelems);
-
-    // and sort entries
-    std::sort(target_table, target_table + params.nelems, target_entry_cmp);
-    std::sort(host_table.begin(), host_table.end(), host_entry_cmp);
-
-    // merge host and target entries and enter matching vars map
-    std::vector<const VarTable::Entry*>::const_iterator hi =
-        host_table.begin();
-    std::vector<const VarTable::Entry*>::const_iterator he =
-        host_table.end();
-    const VarList::BufEntry *ti = target_table;
-    const VarList::BufEntry *te = target_table + params.nelems;
-
-    while (hi != he && ti != te) {
-        int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
-        if (res == 0) {
-            // add matching entry to var map
-            std::pair<PtrSet::iterator, bool> res =
-                m_ptr_set.insert(PtrData((*hi)->addr, (*hi)->size));
-
-            // store address for new entries
-            if (res.second) {
-                PtrData *ptr = const_cast<PtrData*>(res.first.operator->());
-                ptr->mic_addr = ti->addr;
-                ptr->is_static = true;
-            }
-
-            hi++;
-            ti++;
-        }
-        else if (res < 0) {
-            hi++;
-        }
-        else {
-            ti++;
-        }
-    }
-
-    // cleanup
-    res = COI::BufferUnmap(map_inst, 0, 0, 0);
-    check_result(res, c_buf_unmap, res);
-
-    res = COI::BufferDestroy(buffer);
-    check_result(res, c_buf_destroy, res);
-}
-
-COIRESULT Engine::compute(
-    const std::list<COIBUFFER> &buffers,
-    const void*         data,
-    uint16_t            data_size,
-    void*               ret,
-    uint16_t            ret_size,
-    uint32_t            num_deps,
-    const COIEVENT*     deps,
-    COIEVENT*           event
-) /* const */
-{
-    COIBUFFER *bufs;
-    COI_ACCESS_FLAGS *flags;
-    COIRESULT res;
-
-    // convert buffers list to array
-    int num_bufs = buffers.size();
-    if (num_bufs > 0) {
-        bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
-        flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
-                                           sizeof(COI_ACCESS_FLAGS));
-
-        int i = 0;
-        for (std::list<COIBUFFER>::const_iterator it = buffers.begin();
-             it != buffers.end(); it++) {
-            bufs[i] = *it;
-
-            // TODO: this should be fixed
-            flags[i++] = COI_SINK_WRITE;
-        }
-    }
-    else {
-        bufs = 0;
-        flags = 0;
-    }
-
-    // start computation
-    res = COI::PipelineRunFunction(get_pipeline(),
-                                   m_funcs[c_func_compute],
-                                   num_bufs, bufs, flags,
-                                   num_deps, deps,
-                                   data, data_size,
-                                   ret, ret_size,
-                                   event);
-    return res;
-}
-
-pid_t Engine::init_device(void)
-{
-    struct init_data {
-        int  device_index;
-        int  devices_total;
-        int  console_level;
-        int  offload_report_level;
-    } data;
-    COIRESULT res;
-    COIEVENT event;
-    pid_t pid;
-
-    OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
-                          "Initializing device with logical index %d "
-                          "and physical index %d\n",
-                           m_index, m_physical_index);
-
-    // setup misc data
-    data.device_index = m_index;
-    data.devices_total = mic_engines_total;
-    data.console_level = console_enabled;
-    data.offload_report_level = offload_report_level;
-
-    res = COI::PipelineRunFunction(get_pipeline(),
-                                   m_funcs[c_func_init],
-                                   0, 0, 0, 0, 0,
-                                   &data, sizeof(data),
-                                   &pid, sizeof(pid),
-                                   &event);
-    check_result(res, c_pipeline_run_func, m_index, res);
-
-    res = COI::EventWait(1, &event, -1, 1, 0, 0);
-    check_result(res, c_event_wait, res);
-
-    OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);
-
-    return pid;
-}
-
-// data associated with each thread
-struct Thread {
-    Thread(long* addr_coipipe_counter) {
-        m_addr_coipipe_counter = addr_coipipe_counter;
-        memset(m_pipelines, 0, sizeof(m_pipelines));
-    }
-
-    ~Thread() {
-#ifndef TARGET_WINNT
-        __sync_sub_and_fetch(m_addr_coipipe_counter, 1);
-#else // TARGET_WINNT
-        _InterlockedDecrement(m_addr_coipipe_counter);
-#endif // TARGET_WINNT
-        for (int i = 0; i < mic_engines_total; i++) {
-            if (m_pipelines[i] != 0) {
-                COI::PipelineDestroy(m_pipelines[i]);
-            }
-        }
-    }
-
-    COIPIPELINE get_pipeline(int index) const {
-        return m_pipelines[index];
-    }
-
-    void set_pipeline(int index, COIPIPELINE pipeline) {
-        m_pipelines[index] = pipeline;
-    }
-
-    AutoSet& get_auto_vars() {
-        return m_auto_vars;
-    }
-
-private:
-    long*       m_addr_coipipe_counter;
-    AutoSet     m_auto_vars;
-    COIPIPELINE m_pipelines[MIC_ENGINES_MAX];
-};
-
-COIPIPELINE Engine::get_pipeline(void)
-{
-    Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
-    if (thread == 0) {
-        thread = new Thread(&m_proc_number);
-        thread_setspecific(mic_thread_key, thread);
-    }
-
-    COIPIPELINE pipeline = thread->get_pipeline(m_index);
-    if (pipeline == 0) {
-        COIRESULT res;
-        int proc_num;
-
-#ifndef TARGET_WINNT
-        proc_num = __sync_fetch_and_add(&m_proc_number, 1);
-#else // TARGET_WINNT
-        proc_num = _InterlockedIncrement(&m_proc_number);
-#endif // TARGET_WINNT
-
-        if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
-            LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
-            LIBOFFLOAD_ABORT;
-        }
-        // create pipeline for this thread
-        res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
-        check_result(res, c_pipeline_create, m_index, res);
-
-        thread->set_pipeline(m_index, pipeline);
-    }
-    return pipeline;
-}
-
-AutoSet& Engine::get_auto_vars(void)
-{
-    Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
-    if (thread == 0) {
-        thread = new Thread(&m_proc_number);
-        thread_setspecific(mic_thread_key, thread);
-    }
-
-    return thread->get_auto_vars();
-}
-
-void Engine::destroy_thread_data(void *data)
-{
-    delete static_cast<Thread*>(data);
-}
diff --git a/offload/src/offload_engine.h b/offload/src/offload_engine.h
deleted file mode 100644
index d1a9631..0000000
--- a/offload/src/offload_engine.h
+++ /dev/null
@@ -1,482 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_ENGINE_H_INCLUDED
-#define OFFLOAD_ENGINE_H_INCLUDED
-
-#include <limits.h>
-
-#include <list>
-#include <set>
-#include <map>
-#include "offload_common.h"
-#include "coi/coi_client.h"
-
-// Address range
-class MemRange {
-public:
-    MemRange() : m_start(0), m_length(0) {}
-    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}
-
-    const void* start() const {
-        return m_start;
-    }
-
-    const void* end() const {
-        return static_cast<const char*>(m_start) + m_length;
-    }
-
-    uint64_t length() const {
-        return m_length;
-    }
-
-    // returns true if given range overlaps with another one
-    bool overlaps(const MemRange &o) const {
-        // Two address ranges A[start, end) and B[start,end) overlap
-        // if A.start < B.end and A.end > B.start.
-        return start() < o.end() && end() > o.start();
-    }
-
-    // returns true if given range contains the other range
-    bool contains(const MemRange &o) const {
-        return start() <= o.start() && o.end() <= end();
-    }
-
-private:
-    const void* m_start;
-    uint64_t    m_length;
-};
-
-// Data associated with a pointer variable
-class PtrData {
-public:
-    PtrData(const void *addr, uint64_t len) :
-        cpu_addr(addr, len), cpu_buf(0),
-        mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
-        ref_count(0), is_static(false)
-    {}
-
-    //
-    // Copy constructor
-    //
-    PtrData(const PtrData& ptr):
-        cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
-        mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
-        mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
-        ref_count(ptr.ref_count), is_static(ptr.is_static)
-    {}
-
-    bool operator<(const PtrData &o) const {
-        // Variables are sorted by the CPU start address.
-        // Overlapping memory ranges are considered equal.
-        return (cpu_addr.start() < o.cpu_addr.start()) &&
-               !cpu_addr.overlaps(o.cpu_addr);
-    }
-
-    long add_reference() {
-        if (is_static) {
-            return LONG_MAX;
-        }
-#ifndef TARGET_WINNT
-        return __sync_fetch_and_add(&ref_count, 1);
-#else // TARGET_WINNT
-        return _InterlockedIncrement(&ref_count) - 1;
-#endif // TARGET_WINNT
-    }
-
-    long remove_reference() {
-        if (is_static) {
-            return LONG_MAX;
-        }
-#ifndef TARGET_WINNT
-        return __sync_sub_and_fetch(&ref_count, 1);
-#else // TARGET_WINNT
-        return _InterlockedDecrement(&ref_count);
-#endif // TARGET_WINNT
-    }
-
-    long get_reference() const {
-        if (is_static) {
-            return LONG_MAX;
-        }
-        return ref_count;
-    }
-
-public:
-    // CPU address range
-    const MemRange  cpu_addr;
-
-    // CPU and MIC buffers
-    COIBUFFER       cpu_buf;
-    COIBUFFER       mic_buf;
-
-    // placeholder for buffer address on mic
-    uint64_t        mic_addr;
-
-    uint64_t        alloc_disp;
-
-    // additional offset to pointer data on MIC for improving bandwidth for
-    // data which is not 4K aligned
-    uint32_t        mic_offset;
-
-    // if true buffers are created from static memory
-    bool            is_static;
-    mutex_t         alloc_ptr_data_lock;
-
-private:
-    // reference count for the entry
-    long            ref_count;
-};
-
-typedef std::list<PtrData*> PtrDataList;
-
-// Data associated with automatic variable
-class AutoData {
-public:
-    AutoData(const void *addr, uint64_t len) :
-        cpu_addr(addr, len), ref_count(0)
-    {}
-
-    bool operator<(const AutoData &o) const {
-        // Variables are sorted by the CPU start address.
-        // Overlapping memory ranges are considered equal.
-        return (cpu_addr.start() < o.cpu_addr.start()) &&
-               !cpu_addr.overlaps(o.cpu_addr);
-    }
-
-    long add_reference() {
-#ifndef TARGET_WINNT
-        return __sync_fetch_and_add(&ref_count, 1);
-#else // TARGET_WINNT
-        return _InterlockedIncrement(&ref_count) - 1;
-#endif // TARGET_WINNT
-    }
-
-    long remove_reference() {
-#ifndef TARGET_WINNT
-        return __sync_sub_and_fetch(&ref_count, 1);
-#else // TARGET_WINNT
-        return _InterlockedDecrement(&ref_count);
-#endif // TARGET_WINNT
-    }
-
-    long get_reference() const {
-        return ref_count;
-    }
-
-public:
-    // CPU address range
-    const MemRange cpu_addr;
-
-private:
-    // reference count for the entry
-    long ref_count;
-};
-
-// Set of autimatic variables
-typedef std::set<AutoData> AutoSet;
-
-// Target image data
-struct TargetImage
-{
-    TargetImage(const char *_name, const void *_data, uint64_t _size,
-                const char *_origin, uint64_t _offset) :
-        name(_name), data(_data), size(_size),
-        origin(_origin), offset(_offset)
-    {}
-
-    // library name
-    const char* name;
-
-    // contents and size
-    const void* data;
-    uint64_t    size;
-
-    // file of origin and offset within that file
-    const char* origin;
-    uint64_t    offset;
-};
-
-typedef std::list<TargetImage> TargetImageList;
-
-// Data associated with persistent auto objects
-struct PersistData
-{
-    PersistData(const void *addr, uint64_t routine_num, uint64_t size) :
-        stack_cpu_addr(addr), routine_id(routine_num)
-    {
-        stack_ptr_data = new PtrData(0, size);
-    }
-    // 1-st key value - beginning of the stack at CPU
-    const void *   stack_cpu_addr;
-    // 2-nd key value - identifier of routine invocation at CPU
-    uint64_t   routine_id;
-    // corresponded PtrData; only stack_ptr_data->mic_buf is used
-    PtrData * stack_ptr_data;
-    // used to get offset of the variable in stack buffer
-    char * cpu_stack_addr;
-};
-
-typedef std::list<PersistData> PersistDataList;
-
-// class representing a single engine
-struct Engine {
-    friend void __offload_init_library_once(void);
-    friend void __offload_fini_library(void);
-
-#define check_result(res, tag, ...) \
-    { \
-        if (res == COI_PROCESS_DIED) { \
-            fini_process(true); \
-            exit(1); \
-        } \
-        if (res != COI_SUCCESS) { \
-            __liboffload_error_support(tag, __VA_ARGS__); \
-            exit(1); \
-        } \
-    }
-
-    int get_logical_index() const {
-        return m_index;
-    }
-
-    int get_physical_index() const {
-        return m_physical_index;
-    }
-
-    const COIPROCESS& get_process() const {
-        return m_process;
-    }
-
-    // initialize device
-    void init(void);
-
-    // add new library
-    void add_lib(const TargetImage &lib)
-    {
-        m_lock.lock();
-        m_ready = false;
-        m_images.push_back(lib);
-        m_lock.unlock();
-    }
-
-    COIRESULT compute(
-        const std::list<COIBUFFER> &buffers,
-        const void*         data,
-        uint16_t            data_size,
-        void*               ret,
-        uint16_t            ret_size,
-        uint32_t            num_deps,
-        const COIEVENT*     deps,
-        COIEVENT*           event
-    );
-
-#ifdef MYO_SUPPORT
-    // temporary workaround for blocking behavior for myoiLibInit/Fini calls
-    void init_myo(COIEVENT *event) {
-        COIRESULT res;
-        res = COI::PipelineRunFunction(get_pipeline(),
-                                       m_funcs[c_func_myo_init],
-                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                       event);
-        check_result(res, c_pipeline_run_func, m_index, res);
-    }
-
-    void fini_myo(COIEVENT *event) {
-        COIRESULT res;
-        res = COI::PipelineRunFunction(get_pipeline(),
-                                       m_funcs[c_func_myo_fini],
-                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                       event);
-        check_result(res, c_pipeline_run_func, m_index, res);
-    }
-#endif // MYO_SUPPORT
-
-    //
-    // Memory association table
-    //
-    PtrData* find_ptr_data(const void *ptr) {
-        m_ptr_lock.lock();
-        PtrSet::iterator res = m_ptr_set.find(PtrData(ptr, 0));
-        m_ptr_lock.unlock();
-        if (res == m_ptr_set.end()) {
-            return 0;
-        }
-        return const_cast<PtrData*>(res.operator->());
-    }
-
-    PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
-        m_ptr_lock.lock();
-        std::pair<PtrSet::iterator, bool> res =
-            m_ptr_set.insert(PtrData(ptr, len));
-        PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
-        m_ptr_lock.unlock();
-
-        is_new = res.second;
-        if (is_new) {
-            // It's necessary to lock as soon as possible.
-            // unlock must be done at call site of insert_ptr_data at
-            // branch for is_new
-            ptr_data->alloc_ptr_data_lock.lock();
-        }
-        return ptr_data;
-    }
-
-    void remove_ptr_data(const void *ptr) {
-        m_ptr_lock.lock();
-        m_ptr_set.erase(PtrData(ptr, 0));
-        m_ptr_lock.unlock();
-    }
-
-    //
-    // Automatic variables
-    //
-    AutoData* find_auto_data(const void *ptr) {
-        AutoSet &auto_vars = get_auto_vars();
-        AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
-        if (res == auto_vars.end()) {
-            return 0;
-        }
-        return const_cast<AutoData*>(res.operator->());
-    }
-
-    AutoData* insert_auto_data(const void *ptr, uint64_t len) {
-        AutoSet &auto_vars = get_auto_vars();
-        std::pair<AutoSet::iterator, bool> res =
-            auto_vars.insert(AutoData(ptr, len));
-        return const_cast<AutoData*>(res.first.operator->());
-    }
-
-    void remove_auto_data(const void *ptr) {
-        get_auto_vars().erase(AutoData(ptr, 0));
-    }
-
-    //
-    // Signals
-    //
-    void add_signal(const void *signal, OffloadDescriptor *desc) {
-        m_signal_lock.lock();
-        m_signal_map[signal] = desc;
-        m_signal_lock.unlock();
-    }
-
-    OffloadDescriptor* find_signal(const void *signal, bool remove) {
-        OffloadDescriptor *desc = 0;
-
-        m_signal_lock.lock();
-        {
-            SignalMap::iterator it = m_signal_map.find(signal);
-            if (it != m_signal_map.end()) {
-                desc = it->second;
-                if (remove) {
-                    m_signal_map.erase(it);
-                }
-            }
-        }
-        m_signal_lock.unlock();
-
-        return desc;
-    }
-
-    // stop device process
-    void fini_process(bool verbose);
-
-    // list of stacks active at the engine
-    PersistDataList m_persist_list;
-
-private:
-    Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
-               m_proc_number(0)
-    {}
-
-    ~Engine() {
-        if (m_process != 0) {
-            fini_process(false);
-        }
-    }
-
-    // set indexes
-    void set_indexes(int logical_index, int physical_index) {
-        m_index = logical_index;
-        m_physical_index = physical_index;
-    }
-
-    // start process on device
-    void init_process();
-
-    void load_libraries(void);
-    void init_ptr_data(void);
-
-    // performs library intialization on the device side
-    pid_t init_device(void);
-
-private:
-    // get pipeline associated with a calling thread
-    COIPIPELINE get_pipeline(void);
-
-    // get automatic vars set associated with the calling thread
-    AutoSet& get_auto_vars(void);
-
-    // destructor for thread data
-    static void destroy_thread_data(void *data);
-
-private:
-    typedef std::set<PtrData> PtrSet;
-    typedef std::map<const void*, OffloadDescriptor*> SignalMap;
-
-    // device indexes
-    int         m_index;
-    int         m_physical_index;
-
-    // number of COI pipes created for the engine
-    long        m_proc_number;
-
-    // process handle
-    COIPROCESS  m_process;
-
-    // If false, device either has not been initialized or new libraries
-    // have been added.
-    bool        m_ready;
-    mutex_t     m_lock;
-
-    // List of libraries to be loaded
-    TargetImageList m_images;
-
-    // var table
-    PtrSet      m_ptr_set;
-    mutex_t     m_ptr_lock;
-
-    // signals
-    SignalMap m_signal_map;
-    mutex_t   m_signal_lock;
-
-    // constants for accessing device function handles
-    enum {
-        c_func_compute = 0,
-#ifdef MYO_SUPPORT
-        c_func_myo_init,
-        c_func_myo_fini,
-#endif // MYO_SUPPORT
-        c_func_init,
-        c_func_var_table_size,
-        c_func_var_table_copy,
-        c_funcs_total
-    };
-    static const char* m_func_names[c_funcs_total];
-
-    // device function handles
-    COIFUNCTION m_funcs[c_funcs_total];
-
-    // int -> name mapping for device signals
-    static const int   c_signal_max = 32;
-    static const char* c_signal_names[c_signal_max];
-};
-
-#endif // OFFLOAD_ENGINE_H_INCLUDED
diff --git a/offload/src/offload_env.cpp b/offload/src/offload_env.cpp
deleted file mode 100644
index d037338..0000000
--- a/offload/src/offload_env.cpp
+++ /dev/null
@@ -1,354 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_env.h"
-#include <string.h>
-#include <ctype.h>
-#include "offload_util.h"
-#include "liboffload_error_codes.h"
-
-// for environment variables valid on all cards
-const int MicEnvVar::any_card = -1;
-
-MicEnvVar::~MicEnvVar()
-{
-    for (std::list<MicEnvVar::CardEnvVars*>::const_iterator
-         it = card_spec_list.begin();
-         it != card_spec_list.end(); it++) {
-        CardEnvVars *card_data = *it;
-        delete card_data;
-    }
-}
-
-MicEnvVar::VarValue::~VarValue()
-{
-    free(env_var_value);
-}
-
-MicEnvVar::CardEnvVars::~CardEnvVars()
-{
-    for (std::list<MicEnvVar::VarValue*>::const_iterator it = env_vars.begin();
-        it != env_vars.end(); it++) {
-            VarValue *var_value = *it;
-            delete var_value;
-    }
-}
-
-// Searching for card in "card_spec_list" list with the same "number"
-
-MicEnvVar::CardEnvVars* MicEnvVar::get_card(int number)
-{
-    if (number == any_card) {
-        return &common_vars;
-    }
-    for (std::list<MicEnvVar::CardEnvVars*>::const_iterator
-         it = card_spec_list.begin();
-         it != card_spec_list.end(); it++) {
-        CardEnvVars *card_data = *it;
-        if (card_data->card_number == number) {
-            return card_data;
-        }
-    }
-    return NULL;
-}
-
-// Searching for environment variable in "env_var" list with the same name
-
-MicEnvVar::VarValue* MicEnvVar::CardEnvVars::find_var(
-    char* env_var_name,
-    int env_var_name_length
-)
-{
-    for (std::list<MicEnvVar::VarValue*>::const_iterator it = env_vars.begin();
-        it != env_vars.end(); it++) {
-            VarValue *var_value = *it;
-            if (var_value->length == env_var_name_length &&
-                !strncmp(var_value->env_var, env_var_name,
-                         env_var_name_length)) {
-                return var_value;
-            }
-    }
-    return NULL;
-}
-
-void MicEnvVar::analyze_env_var(char *env_var_string)
-{
-    char          *env_var_name;
-    char          *env_var_def;
-    int           card_number;
-    int           env_var_name_length;
-    MicEnvVarKind env_var_kind;
-
-    env_var_kind = get_env_var_kind(env_var_string,
-                                    &card_number,
-                                    &env_var_name,
-                                    &env_var_name_length,
-                                    &env_var_def);
-    switch (env_var_kind) {
-        case c_mic_var:
-        case c_mic_card_var:
-            add_env_var(card_number,
-                        env_var_name,
-                        env_var_name_length,
-                        env_var_def);
-            break;
-        case c_mic_card_env:
-            mic_parse_env_var_list(card_number, env_var_def);
-            break;
-        case c_no_mic:
-        default:
-            break;
-    }
-}
-
-void MicEnvVar::add_env_var(
-    int card_number,
-    char *env_var_name,
-    int env_var_name_length,
-    char *env_var_def
-)
-{
-    VarValue *var;
-    CardEnvVars *card;
-
-    // The case corresponds to common env var definition of kind
-    // <mic-prefix>_<var>
-    if (card_number == any_card) {
-        card = &common_vars;
-    }
-    else {
-        card = get_card(card_number);
-        if (!card) {
-            // definition for new card occurred
-            card = new CardEnvVars(card_number);
-            card_spec_list.push_back(card);
-        }
-
-    }
-    var = card->find_var(env_var_name, env_var_name_length);
-    if (!var) {
-        // put new env var definition in "env_var" list
-        var = new VarValue(env_var_name, env_var_name_length, env_var_def);
-        card->env_vars.push_back(var);
-    }
-}
-
-// The routine analyses string pointed by "env_var_string" argument
-// according to the following syntax:
-//
-// Specification of prefix for MIC environment variables
-// MIC_ENV_PREFIX=<mic-prefix>
-//
-// Setting single MIC environment variable
-// <mic-prefix>_<var>=<value>
-// <mic-prefix>_<card-number>_<var>=<value>
-
-// Setting multiple MIC environment variables
-// <mic-prefix>_<card-number>_ENV=<env-vars>
-
-MicEnvVarKind MicEnvVar::get_env_var_kind(
-    char *env_var_string,
-    int *card_number,
-    char **env_var_name,
-    int *env_var_name_length,
-    char **env_var_def
-)
-{
-    int len = strlen(prefix);
-    char *c = env_var_string;
-    int num = 0;
-    bool card_is_set = false;
-
-    if (strncmp(c, prefix, len) != 0 || c[len] != '_') {
-            return c_no_mic;
-    }
-    c += len + 1;
-
-    *card_number = any_card;
-    if (isdigit(*c)) {
-        while (isdigit (*c)) {
-            num = (*c++ - '0') + (num * 10);
-        }
-    if (*c != '_') {
-        return c_no_mic;
-    }
-    c++;
-        *card_number = num;
-        card_is_set = true;
-    }
-    if (!isalpha(*c)) {
-        return c_no_mic;
-    }
-    *env_var_name = *env_var_def = c;
-    if (strncmp(c, "ENV=", 4) == 0) {
-        if (!card_is_set) {
-            *env_var_name_length = 3;
-            *env_var_name = *env_var_def = c;
-            *env_var_def = strdup(*env_var_def);
-            return  c_mic_var;
-        }
-        *env_var_def = c + strlen("ENV=");
-        *env_var_def = strdup(*env_var_def);
-        return c_mic_card_env;
-    }
-    if (isalpha(*c)) {
-        *env_var_name_length = 0;
-        while (isalnum(*c) || *c == '_') {
-            c++;
-            (*env_var_name_length)++;
-        }
-    }
-    if (*c != '=') {
-        return c_no_mic;
-    }
-    *env_var_def = strdup(*env_var_def);
-    return card_is_set? c_mic_card_var : c_mic_var;
-}
-
-// analysing <env-vars> in form:
-// <mic-prefix>_<card-number>_ENV=<env-vars>
-// where:
-//
-// <env-vars>:
-//                <env-var>
-//                <env-vars> | <env-var>
-//
-// <env-var>:
-//                variable=value
-//                variable="value"
-//                variable=
-
-void MicEnvVar::mic_parse_env_var_list(
-    int card_number, char *env_vars_def_list)
-{
-    char *c = env_vars_def_list;
-    char *env_var_name;
-    int  env_var_name_length;
-    char *env_var_def;
-    bool var_is_quoted;
-
-    if (*c == '"') {
-        c++;
-    }
-    while (*c != 0) {
-        var_is_quoted = false;
-        env_var_name = c;
-        env_var_name_length = 0;
-        if (isalpha(*c)) {
-            while (isalnum(*c) || *c == '_') {
-                c++;
-                env_var_name_length++;
-            }
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_mic_parse_env_var_list1);
-            return;
-        }
-        if (*c != '=') {
-            LIBOFFLOAD_ERROR(c_mic_parse_env_var_list2);
-            return;
-        }
-        c++;
-
-        if (*c == '"') {
-            var_is_quoted = true;
-            c++;
-        }
-        // Environment variable values that contain | will need to be escaped.
-        while (*c != 0 && *c != '|' &&
-               (!var_is_quoted || *c != '"'))
-        {
-            // skip escaped symbol
-            if (*c == '\\') {
-                c++;
-            }
-            c++;
-        }
-        if (var_is_quoted) {
-            c++; // for "
-            while (*c != 0 && *c != '|') {
-                c++;
-            }
-        }
-
-        int sz = c - env_var_name;
-        env_var_def = (char*)malloc(sz);
-        memcpy(env_var_def, env_var_name, sz);
-        env_var_def[sz] = 0;
-
-        if (*c == '|') {
-            c++;
-            while (*c != 0 && *c == ' ') {
-                c++;
-            }
-        }
-        add_env_var(card_number,
-                    env_var_name,
-                    env_var_name_length,
-                    env_var_def);
-    }
-}
-
-// Collect all definitions for the card with number "card_num".
-// The returned result is vector of string pointers defining one
-// environment variable. The vector is terminated by NULL pointer.
-// In the beginning of the vector there are env vars defined as
-// <mic-prefix>_<card-number>_<var>=<value>
-// or
-// <mic-prefix>_<card-number>_ENV=<env-vars>
-// where <card-number> is equal to "card_num"
-// They are followed by definitions valid for any card
-// and absent in previous definitions.
-
-char** MicEnvVar::create_environ_for_card(int card_num)
-{
-    VarValue *var_value;
-    VarValue *var_value_find;
-    CardEnvVars *card_data = get_card(card_num);
-    CardEnvVars *card_data_common;
-    std::list<char*> new_env;
-    char **rez;
-
-    if (!prefix) {
-        return NULL;
-    }
-    // There is no personel env var definitions for the card with
-    // number "card_num"
-    if (!card_data) {
-        return create_environ_for_card(any_card);
-    }
-
-    for (std::list<MicEnvVar::VarValue*>::const_iterator
-         it = card_data->env_vars.begin();
-         it != card_data->env_vars.end(); it++) {
-        var_value = *it;
-        new_env.push_back(var_value->env_var_value);
-    }
-
-    if (card_num != any_card) {
-        card_data_common = get_card(any_card);
-        for (std::list<MicEnvVar::VarValue*>::const_iterator
-             it = card_data_common->env_vars.begin();
-             it != card_data_common->env_vars.end(); it++) {
-            var_value = *it;
-            var_value_find = card_data->find_var(var_value->env_var,
-                                                 var_value->length);
-            if (!var_value_find) {
-                new_env.push_back(var_value->env_var_value);
-            }
-        }
-    }
-
-    int new_env_size = new_env.size();
-    rez = (char**) malloc((new_env_size + 1) * sizeof(char*));
-    std::copy(new_env.begin(), new_env.end(), rez);
-    rez[new_env_size] = 0;
-    return rez;
-}
diff --git a/offload/src/offload_env.h b/offload/src/offload_env.h
deleted file mode 100644
index f035ff6..0000000
--- a/offload/src/offload_env.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_ENV_H_INCLUDED
-#define OFFLOAD_ENV_H_INCLUDED
-
-#include <list>
-
-// data structure and routines to parse MIC user environment and pass to MIC
-
-enum MicEnvVarKind
-{
-    c_no_mic,         // not MIC env var
-    c_mic_var,        // for <mic-prefix>_<var>
-    c_mic_card_var,   // for <mic-prefix>_<card-number>_<var>
-    c_mic_card_env    // for <mic-prefix>_<card-number>_ENV
-};
-
-struct MicEnvVar {
-public:
-    MicEnvVar() : prefix(0) {}
-    ~MicEnvVar();
-
-    void analyze_env_var(char *env_var_string);
-    char** create_environ_for_card(int card_num);
-    MicEnvVarKind get_env_var_kind(
-        char *env_var_string,
-        int *card_number,
-        char **env_var_name,
-        int *env_var_name_length,
-        char **env_var_def
-    );
-    void add_env_var(
-        int card_number,
-        char *env_var_name,
-        int env_var_name_length,
-        char *env_var_def
-    );
-
-    void set_prefix(const char *pref) {
-        prefix = (pref && *pref != '\0') ? pref : 0;
-    }
-
-    struct VarValue {
-    public:
-        char* env_var;
-        int   length;
-        char* env_var_value;
-
-        VarValue(char* var, int ln, char* value)
-        {
-            env_var = var;
-            length = ln;
-            env_var_value = value;
-        }
-        ~VarValue();
-    };
-
-    struct CardEnvVars {
-    public:
-
-        int card_number;
-        std::list<struct VarValue*> env_vars;
-
-        CardEnvVars() { card_number = any_card; }
-        CardEnvVars(int num) { card_number = num; }
-        ~CardEnvVars();
-
-        void add_new_env_var(int number, char *env_var, int length,
-                             char *env_var_value);
-        VarValue* find_var(char* env_var_name, int env_var_name_length);
-    };
-    static const int any_card;
-
-private:
-    void         mic_parse_env_var_list(int card_number, char *env_var_def);
-    CardEnvVars* get_card(int number);
-
-    const char *prefix;
-    std::list<struct CardEnvVars *> card_spec_list;
-    CardEnvVars common_vars;
-};
-
-#endif // OFFLOAD_ENV_H_INCLUDED
diff --git a/offload/src/offload_host.cpp b/offload/src/offload_host.cpp
deleted file mode 100644
index 38d5139..0000000
--- a/offload/src/offload_host.cpp
+++ /dev/null
@@ -1,4360 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// Forward declaration as the following 2 functions are declared as friend in offload_engine.h
-// CLANG does not like static to been after friend declaration.
-static void __offload_init_library_once(void);
-static void __offload_fini_library(void);
-
-#include "offload_host.h"
-#ifdef MYO_SUPPORT
-#include "offload_myo_host.h"
-#endif
-
-#include <malloc.h>
-#ifndef TARGET_WINNT
-#include <alloca.h>
-#include <elf.h>
-#endif // TARGET_WINNT
-#include <errno.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <algorithm>
-#include <bitset>
-
-#if defined(HOST_WINNT)
-#define PATH_SEPARATOR ";"
-#else
-#define PATH_SEPARATOR ":"
-#endif
-
-#define GET_OFFLOAD_NUMBER(timer_data) \
-    timer_data? timer_data->offload_number : 0
-
-#ifdef TARGET_WINNT
-// Small subset of ELF declarations for Windows which is needed to compile
-// this file. ELF header is used to understand what binary type is contained
-// in the target image - shared library or executable.
-
-typedef uint16_t Elf64_Half;
-typedef uint32_t Elf64_Word;
-typedef uint64_t Elf64_Addr;
-typedef uint64_t Elf64_Off;
-
-#define EI_NIDENT   16
-
-#define ET_EXEC     2
-#define ET_DYN      3
-
-typedef struct
-{
-    unsigned char e_ident[EI_NIDENT];
-    Elf64_Half    e_type;
-    Elf64_Half    e_machine;
-    Elf64_Word    e_version;
-    Elf64_Addr    e_entry;
-    Elf64_Off     e_phoff;
-    Elf64_Off     e_shoff;
-    Elf64_Word    e_flags;
-    Elf64_Half    e_ehsize;
-    Elf64_Half    e_phentsize;
-    Elf64_Half    e_phnum;
-    Elf64_Half    e_shentsize;
-    Elf64_Half    e_shnum;
-    Elf64_Half    e_shstrndx;
-} Elf64_Ehdr;
-#endif // TARGET_WINNT
-
-// Host console and file logging
-const char *prefix;
-int console_enabled = 0;
-int offload_number = 0;
-
-static const char *htrace_envname = "H_TRACE";
-static const char *offload_report_envname = "OFFLOAD_REPORT";
-static char *timer_envname = "H_TIME";
-
-// Trace information
-static const char* vardesc_direction_as_string[] = {
-    "NOCOPY",
-    "IN",
-    "OUT",
-    "INOUT"
-};
-static const char* vardesc_type_as_string[] = {
-    "unknown",
-    "data",
-    "data_ptr",
-    "func_ptr",
-    "void_ptr",
-    "string_ptr",
-    "dv",
-    "dv_data",
-    "dv_data_slice",
-    "dv_ptr",
-    "dv_ptr_data",
-    "dv_ptr_data_slice",
-    "cean_var",
-    "cean_var_ptr",
-    "c_data_ptr_array",
-    "c_func_ptr_array",
-    "c_void_ptr_array",
-    "c_string_ptr_array"
-};
-
-Engine*         mic_engines = 0;
-uint32_t        mic_engines_total = 0;
-pthread_key_t   mic_thread_key;
-MicEnvVar       mic_env_vars;
-uint64_t        cpu_frequency = 0;
-
-// MIC_STACKSIZE
-uint32_t mic_stack_size = 12 * 1024 * 1024;
-
-// MIC_BUFFERSIZE
-uint64_t mic_buffer_size = 0;
-
-// MIC_LD_LIBRARY_PATH
-char* mic_library_path = 0;
-
-// MIC_PROXY_IO
-bool mic_proxy_io = true;
-
-// MIC_PROXY_FS_ROOT
-char* mic_proxy_fs_root = 0;
-
-// Threshold for creating buffers with large pages. Buffer is created
-// with large pages hint if its size exceeds the threshold value.
-// By default large pages are disabled right now (by setting default
-// value for threshold to MAX) due to HSD 4114629.
-uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
-static const char *mic_use_2mb_buffers_envname  =
-    "MIC_USE_2MB_BUFFERS";
-
-static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
-static const char *mic_use_async_buffer_write_envname  =
-    "MIC_USE_ASYNC_BUFFER_WRITE";
-
-static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
-static const char *mic_use_async_buffer_read_envname  =
-    "MIC_USE_ASYNC_BUFFER_READ";
-
-// device initialization type
-OffloadInitType __offload_init_type = c_init_on_offload_all;
-static const char *offload_init_envname = "OFFLOAD_INIT";
-
-// active wait
-static bool __offload_active_wait = true;
-static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
-
-// OMP_DEFAULT_DEVICE
-int __omp_device_num = 0;
-static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
-
-// The list of pending target libraries
-static bool            __target_libs;
-static TargetImageList __target_libs_list;
-static mutex_t         __target_libs_lock;
-static mutex_t         stack_alloc_lock;
-
-// Target executable
-TargetImage*           __target_exe;
-
-static char * offload_get_src_base(void * ptr, uint8_t type)
-{
-    char *base;
-    if (VAR_TYPE_IS_PTR(type)) {
-        base = *static_cast<char**>(ptr);
-    }
-    else if (VAR_TYPE_IS_SCALAR(type)) {
-        base = static_cast<char*>(ptr);
-    }
-    else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
-        ArrDesc *dvp;
-        if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
-            const arr_desc *ap = static_cast<const arr_desc*>(ptr);
-            dvp = (type == c_dv_data_slice) ?
-                  reinterpret_cast<ArrDesc*>(ap->base) :
-                  *reinterpret_cast<ArrDesc**>(ap->base);
-        }
-        else {
-            dvp = (type == c_dv_data) ?
-                  static_cast<ArrDesc*>(ptr) :
-                  *static_cast<ArrDesc**>(ptr);
-        }
-        base = reinterpret_cast<char*>(dvp->Base);
-    }
-    else {
-        base = NULL;
-    }
-    return base;
-}
-
-void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
-{
-    // special case for the 'process died' error
-    if (res == COI_PROCESS_DIED) {
-        m_device.fini_process(true);
-    }
-    else {
-        switch (msg) {
-            case c_buf_create:
-                if (res == COI_OUT_OF_MEMORY) {
-                    msg = c_buf_create_out_of_mem;
-                }
-                /* fallthru */
-
-            case c_buf_create_from_mem:
-            case c_buf_get_address:
-            case c_pipeline_create:
-            case c_pipeline_run_func:
-                LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
-                break;
-
-            case c_buf_read:
-            case c_buf_write:
-            case c_buf_copy:
-            case c_buf_map:
-            case c_buf_unmap:
-            case c_buf_destroy:
-            case c_buf_set_state:
-                LIBOFFLOAD_ERROR(msg, res);
-                break;
-
-            default:
-                break;
-        }
-    }
-
-    exit(1);
-}
-
-_Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
-{
-    switch (res) {
-        case COI_SUCCESS:
-            return OFFLOAD_SUCCESS;
-
-        case COI_PROCESS_DIED:
-            return OFFLOAD_PROCESS_DIED;
-
-        case COI_OUT_OF_MEMORY:
-            return OFFLOAD_OUT_OF_MEMORY;
-
-        default:
-            return OFFLOAD_ERROR;
-    }
-}
-
-bool OffloadDescriptor::alloc_ptr_data(
-    PtrData* &ptr_data,
-    void *base,
-    int64_t disp,
-    int64_t size,
-    int64_t alloc_disp,
-    int align
-)
-{
-    // total length of base
-    int64_t length = disp + size;
-    bool is_new;
-
-    OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
-                  base, length);
-
-    // add new entry
-    ptr_data = m_device.insert_ptr_data(base, length, is_new);
-    if (is_new) {
-
-        OFFLOAD_TRACE(3, "Added new association\n");
-
-        if (length > 0) {
-            OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
-            COIRESULT res;
-
-            // align should be a power of 2
-            if (align > 0 && (align & (align - 1)) == 0) {
-                // offset within mic_buffer. Can do offset optimization
-                // only when source address alignment satisfies requested
-                // alignment on the target (cq172736).
-                if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
-                    ptr_data->mic_offset = reinterpret_cast<intptr_t>(base) & 4095;
-                }
-            }
-
-            // buffer size and flags
-            uint64_t buffer_size = length + ptr_data->mic_offset;
-            uint32_t buffer_flags = 0;
-
-            // create buffer with large pages if data length exceeds
-            // large page threshold
-            if (length >= __offload_use_2mb_buffers) {
-                buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
-            }
-
-            // create CPU buffer
-            OFFLOAD_DEBUG_TRACE_1(3,
-                          GET_OFFLOAD_NUMBER(get_timer_data()),
-                          c_offload_create_buf_host,
-                          "Creating buffer from source memory %p, "
-                          "length %lld\n", base, length);
-
-            // result is not checked because we can continue without cpu
-            // buffer. In this case we will use COIBufferRead/Write instead
-            // of COIBufferCopy.
-            COI::BufferCreateFromMemory(length,
-                                        COI_BUFFER_NORMAL,
-                                        0,
-                                        base,
-                                        1,
-                                        &m_device.get_process(),
-                                        &ptr_data->cpu_buf);
-
-            OFFLOAD_DEBUG_TRACE_1(3,
-                          GET_OFFLOAD_NUMBER(get_timer_data()),
-                          c_offload_create_buf_mic,
-                          "Creating buffer for sink: size %lld, offset %d, "
-                          "flags =0x%x\n", buffer_size - alloc_disp,
-                          ptr_data->mic_offset, buffer_flags);
-
-            // create MIC buffer
-            res = COI::BufferCreate(buffer_size - alloc_disp,
-                                    COI_BUFFER_NORMAL,
-                                    buffer_flags,
-                                    0,
-                                    1,
-                                    &m_device.get_process(),
-                                    &ptr_data->mic_buf);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                }
-                else if (m_is_mandatory) {
-                    report_coi_error(c_buf_create, res);
-                }
-                ptr_data->alloc_ptr_data_lock.unlock();
-                return false;
-            }
-
-            // make buffer valid on the device.
-            res = COI::BufferSetState(ptr_data->mic_buf,
-                                      m_device.get_process(),
-                                      COI_BUFFER_VALID,
-                                      COI_BUFFER_NO_MOVE,
-                                      0, 0, 0);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                }
-                else if (m_is_mandatory) {
-                    report_coi_error(c_buf_set_state, res);
-                }
-                ptr_data->alloc_ptr_data_lock.unlock();
-                return false;
-            }
-
-            res = COI::BufferSetState(ptr_data->mic_buf,
-                                      COI_PROCESS_SOURCE,
-                                      COI_BUFFER_INVALID,
-                                      COI_BUFFER_NO_MOVE,
-                                      0, 0, 0);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                }
-                else if (m_is_mandatory) {
-                    report_coi_error(c_buf_set_state, res);
-                }
-                ptr_data->alloc_ptr_data_lock.unlock();
-                return false;
-            }
-        }
-
-        ptr_data->alloc_disp = alloc_disp;
-        ptr_data->alloc_ptr_data_lock.unlock();
-    }
-    else {
-        mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
-
-        OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
-                      "is_static %d\n",
-                      ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
-                      ptr_data->is_static);
-
-        // This is not a new entry. Make sure that provided address range fits
-        // into existing one.
-        MemRange addr_range(base, length - ptr_data->alloc_disp);
-        if (!ptr_data->cpu_addr.contains(addr_range)) {
-            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
-            exit(1);
-        }
-
-        // if the entry is associated with static data it may not have buffers
-        // created because they are created on demand.
-        if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-bool OffloadDescriptor::find_ptr_data(
-    PtrData* &ptr_data,
-    void *base,
-    int64_t disp,
-    int64_t size,
-    bool report_error
-)
-{
-    // total length of base
-    int64_t length = disp + size;
-
-    OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
-                  "length %lld\n", base, length);
-
-    // find existing association in pointer table
-    ptr_data = m_device.find_ptr_data(base);
-    if (ptr_data == 0) {
-        if (report_error) {
-            LIBOFFLOAD_ERROR(c_no_ptr_data, base);
-            exit(1);
-        }
-        OFFLOAD_TRACE(3, "Association does not exist\n");
-        return true;
-    }
-
-    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
-                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
-                  ptr_data->is_static);
-
-    // make sure that provided address range fits into existing one
-    MemRange addr_range(base, length);
-    if (!ptr_data->cpu_addr.contains(addr_range)) {
-        if (report_error) {
-            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
-            exit(1);
-        }
-        OFFLOAD_TRACE(3, "Existing association partially overlaps with "
-                      "data address range\n");
-        ptr_data = 0;
-        return true;
-    }
-
-    // if the entry is associated with static data it may not have buffers
-    // created because they are created on demand.
-    if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
-        return false;
-    }
-
-    return true;
-}
-
-bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
-
-    if (ptr_data->cpu_buf == 0) {
-        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
-                      ptr_data->cpu_addr.start());
-
-        COIRESULT res = COI::BufferCreateFromMemory(
-            ptr_data->cpu_addr.length(),
-            COI_BUFFER_NORMAL,
-            0,
-            const_cast<void*>(ptr_data->cpu_addr.start()),
-            1, &m_device.get_process(),
-            &ptr_data->cpu_buf);
-
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-                return false;
-            }
-            report_coi_error(c_buf_create_from_mem, res);
-        }
-    }
-
-    if (ptr_data->mic_buf == 0) {
-        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
-                      ptr_data->mic_addr);
-
-        COIRESULT res = COI::BufferCreateFromMemory(
-            ptr_data->cpu_addr.length(),
-            COI_BUFFER_NORMAL,
-            COI_SINK_MEMORY,
-            reinterpret_cast<void*>(ptr_data->mic_addr),
-            1, &m_device.get_process(),
-            &ptr_data->mic_buf);
-
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-                return false;
-            }
-            report_coi_error(c_buf_create_from_mem, res);
-        }
-    }
-
-    return true;
-}
-
-bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
-{
-    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
-        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
-                                                  &ptr_data->mic_addr);
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-            }
-            else if (m_is_mandatory) {
-                report_coi_error(c_buf_get_address, res);
-            }
-            return false;
-        }
-    }
-    return true;
-}
-
-bool OffloadDescriptor::nullify_target_stack(
-    COIBUFFER targ_buf,
-    uint64_t size
-)
-{
-    char * ptr = (char*)malloc(size);
-    COIRESULT res;
-
-    memset(ptr, 0, size);
-    res = COI::BufferWrite(
-        targ_buf,
-        0,
-        ptr,
-        size,
-        COI_COPY_UNSPECIFIED,
-        0, 0, 0);
-    free(ptr);
-    if (res != COI_SUCCESS) {
-        if (m_status != 0) {
-            m_status->result = translate_coi_error(res);
-            return false;
-        }
-        report_coi_error(c_buf_write, res);
-    }
-    return true;
-}
-
-bool OffloadDescriptor::offload_stack_memory_manager(
-    const void * stack_begin,
-    int  routine_id,
-    int  buf_size,
-    int  align,
-    bool *is_new)
-{
-    mutex_locker_t locker(stack_alloc_lock);
-
-    PersistData * new_el;
-    PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
-    PersistDataList::iterator it_end;
-    int erase = 0;
-
-    *is_new = false;
-
-    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
-        it != m_device.m_persist_list.end(); it++) {
-        PersistData cur_el = *it;
-
-        if (stack_begin > it->stack_cpu_addr) {
-            // this stack data must be destroyed
-            m_destroy_stack.push_front(cur_el.stack_ptr_data);
-            it_end = it;
-            erase++;
-        }
-        else if (stack_begin == it->stack_cpu_addr) {
-            if (routine_id != it-> routine_id) {
-                // this stack data must be destroyed
-                m_destroy_stack.push_front(cur_el.stack_ptr_data);
-                it_end = it;
-                erase++;
-                break;
-            }
-            else {
-                // stack data is reused
-                m_stack_ptr_data = it->stack_ptr_data;
-                if (erase > 0) {
-                    // all obsolete stack sections must be erased from the list
-                    m_device.m_persist_list.erase(it_begin, ++it_end);
-
-                    m_in_datalen +=
-                        erase * sizeof(new_el->stack_ptr_data->mic_addr);
-                }
-                OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
-                                 m_stack_ptr_data->mic_addr);
-                return true;
-            }
-        }
-        else if (stack_begin < it->stack_cpu_addr) {
-            break;
-        }
-    }
-
-    if (erase > 0) {
-        // all obsolete stack sections must be erased from the list
-        m_device.m_persist_list.erase(it_begin, ++it_end);
-        m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
-    }
-    // new stack table is created
-    new_el = new PersistData(stack_begin, routine_id, buf_size);
-    // create MIC buffer
-    COIRESULT res;
-    uint32_t buffer_flags = 0;
-
-    // create buffer with large pages if data length exceeds
-    // large page threshold
-    if (buf_size >= __offload_use_2mb_buffers) {
-        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
-    }
-    res = COI::BufferCreate(buf_size,
-        COI_BUFFER_NORMAL,
-        buffer_flags,
-        0,
-        1,
-        &m_device.get_process(),
-        &new_el->stack_ptr_data->mic_buf);
-    if (res != COI_SUCCESS) {
-        if (m_status != 0) {
-            m_status->result = translate_coi_error(res);
-        }
-        else if (m_is_mandatory) {
-            report_coi_error(c_buf_create, res);
-        }
-        return false;
-    }
-    // make buffer valid on the device.
-    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
-        m_device.get_process(),
-        COI_BUFFER_VALID,
-        COI_BUFFER_NO_MOVE,
-        0, 0, 0);
-    if (res != COI_SUCCESS) {
-        if (m_status != 0) {
-            m_status->result = translate_coi_error(res);
-        }
-        else if (m_is_mandatory) {
-            report_coi_error(c_buf_set_state, res);
-        }
-        return false;
-    }
-    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
-        COI_PROCESS_SOURCE,
-        COI_BUFFER_INVALID,
-        COI_BUFFER_NO_MOVE,
-        0, 0, 0);
-    if (res != COI_SUCCESS) {
-        if (m_status != 0) {
-            m_status->result = translate_coi_error(res);
-        }
-        else if (m_is_mandatory) {
-            report_coi_error(c_buf_set_state, res);
-        }
-        return false;
-    }
-    // persistence algorithm requires target stack initialy to be nullified
-    if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
-        return false;
-    }
-
-    m_stack_ptr_data = new_el->stack_ptr_data;
-    init_mic_address(m_stack_ptr_data);
-    OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
-                      m_stack_ptr_data->mic_addr);
-    m_device.m_persist_list.push_front(*new_el);
-    init_mic_address(new_el->stack_ptr_data);
-    *is_new = true;
-    return true;
-}
-
-bool OffloadDescriptor::setup_descriptors(
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int vars_total,
-    int entry_id,
-    const void *stack_addr
-)
-{
-    COIRESULT res;
-
-    OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
-
-    // make a copy of variable descriptors
-    m_vars_total = vars_total;
-    if (vars_total > 0) {
-        m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
-        memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
-        m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
-    }
-
-    // dependencies
-    m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total  + 1));
-    if (m_vars_total > 0) {
-        m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total);
-    }
-
-    // copyin/copyout data length
-    m_in_datalen = 0;
-    m_out_datalen = 0;
-
-    // First pass over variable descriptors
-    // - Calculate size of the input and output non-pointer data
-    // - Allocate buffers for input and output pointers
-    for (int i = 0; i < m_vars_total; i++) {
-        void*   alloc_base = NULL;
-        int64_t alloc_disp = 0;
-        int64_t alloc_size;
-        bool    src_is_for_mic = (m_vars[i].direction.out ||
-                                  m_vars[i].into == NULL);
-
-        const char *var_sname = "";
-        if (vars2 != NULL && i < vars_total) {
-            if (vars2[i].sname != NULL) {
-                var_sname = vars2[i].sname;
-            }
-        }
-        OFFLOAD_TRACE(2, "   VarDesc %d, var=%s, %s, %s\n",
-            i, var_sname,
-            vardesc_direction_as_string[m_vars[i].direction.bits],
-            vardesc_type_as_string[m_vars[i].type.src]);
-        if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
-            OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
-                vardesc_type_as_string[m_vars[i].type.dst]);
-        }
-        OFFLOAD_TRACE(2,
-            "              type_src=%d, type_dstn=%d, direction=%d, "
-            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
-            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
-            m_vars[i].type.src,
-            m_vars[i].type.dst,
-            m_vars[i].direction.bits,
-            m_vars[i].alloc_if,
-            m_vars[i].free_if,
-            m_vars[i].align,
-            m_vars[i].mic_offset,
-            m_vars[i].flags.bits,
-            m_vars[i].offset,
-            m_vars[i].size,
-            m_vars[i].count,
-            m_vars[i].ptr,
-            m_vars[i].into);
-
-        if (m_vars[i].alloc != NULL) {
-            // array descriptor
-            const arr_desc *ap =
-                static_cast<const arr_desc*>(m_vars[i].alloc);
-
-            // debug dump
-            __arr_desc_dump("    ", "ALLOC", ap, 0);
-
-            __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
-
-            alloc_base = reinterpret_cast<void*>(ap->base);
-        }
-
-        m_vars_extra[i].cpu_disp = 0;
-        m_vars_extra[i].cpu_offset = 0;
-        m_vars_extra[i].src_data = 0;
-        m_vars_extra[i].read_rng_src = 0;
-        m_vars_extra[i].read_rng_dst = 0;
-        // flag is_arr_ptr_el is 1 only for var_descs generated
-        // for c_data_ptr_array type
-        if (i < vars_total) {
-            m_vars_extra[i].is_arr_ptr_el = 0;
-        }
-
-        switch (m_vars[i].type.src) {
-            case c_data_ptr_array:
-                {
-                    const arr_desc *ap;
-                    const VarDesc3 *vd3 =
-                        static_cast<const VarDesc3*>(m_vars[i].ptr);
-                    int flags = vd3->array_fields;
-                    OFFLOAD_TRACE(2,
-                        "              pointer array flags = %04x\n", flags);
-                    OFFLOAD_TRACE(2,
-                        "              pointer array type is %s\n",
-                        vardesc_type_as_string[flags & 0x3f]);
-                    ap = static_cast<const arr_desc*>(vd3->ptr_array);
-                    __arr_desc_dump("              ", "ptr array", ap, 0);
-                    if (m_vars[i].into) {
-                        ap = static_cast<const arr_desc*>(m_vars[i].into);
-                        __arr_desc_dump(
-                            "              ", "into array", ap, 0);
-                    }
-                    if ((flags & (1<<flag_align_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->align_array);
-                        __arr_desc_dump(
-                            "              ", "align array", ap, 0);
-                    }
-                    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
-                        __arr_desc_dump(
-                            "              ", "alloc_if array", ap, 0);
-                    }
-                    if ((flags & (1<<flag_free_if_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->free_if_array);
-                        __arr_desc_dump(
-                            "              ", "free_if array", ap, 0);
-                    }
-                    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->extent_start);
-                        __arr_desc_dump(
-                            "              ", "extent_start array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_extent_start_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              extent_start scalar = %d\n",
-                            (int64_t)vd3->extent_start);
-                    }
-                    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>
-                            (vd3->extent_elements);
-                        __arr_desc_dump(
-                            "              ", "extent_elements array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_extent_elements_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              extent_elements scalar = %d\n",
-                            (int64_t)vd3->extent_elements);
-                    }
-                    if ((flags & (1<<flag_into_start_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->into_start);
-                        __arr_desc_dump(
-                            "              ", "into_start array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_into_start_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              into_start scalar = %d\n",
-                            (int64_t)vd3->into_start);
-                    }
-                    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->into_elements);
-                        __arr_desc_dump(
-                            "              ", "into_elements array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_into_elements_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              into_elements scalar = %d\n",
-                            (int64_t)vd3->into_elements);
-                    }
-                    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->alloc_start);
-                        __arr_desc_dump(
-                            "              ", "alloc_start array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_alloc_start_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              alloc_start scalar = %d\n",
-                            (int64_t)vd3->alloc_start);
-                    }
-                    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
-                        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
-                        __arr_desc_dump(
-                            "              ", "alloc_elements array", ap, 0);
-                    } else if ((flags &
-                        (1<<flag_alloc_elements_is_scalar)) != 0) {
-                        OFFLOAD_TRACE(2,
-                            "              alloc_elements scalar = %d\n",
-                            (int64_t)vd3->alloc_elements);
-                    }
-                }
-                if (!gen_var_descs_for_pointer_array(i)) {
-                    return false;
-                }
-                break;
-
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-                // In all uses later
-                // VarDesc.size will have the length of the data to be
-                // transferred
-                // VarDesc.disp will have an offset from base
-                if (m_vars[i].type.src == c_cean_var) {
-                    // array descriptor
-                    const arr_desc *ap =
-                        static_cast<const arr_desc*>(m_vars[i].ptr);
-
-                    // debug dump
-                    __arr_desc_dump("", "IN/OUT", ap, 0);
-
-                    // offset and length are derived from the array descriptor
-                    __arr_data_offset_and_length(ap, m_vars[i].disp,
-                                                 m_vars[i].size);
-                    if (!is_arr_desc_contiguous(ap)) {
-                        m_vars[i].flags.is_noncont_src = 1;
-                        m_vars_extra[i].read_rng_src =
-                            init_read_ranges_arr_desc(ap);
-                    }
-                    // all necessary information about length and offset is
-                    // transferred in var descriptor. There is no need to send
-                    // array descriptor to the target side.
-                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
-                }
-                else {
-                    m_vars[i].size *= m_vars[i].count;
-                    m_vars[i].disp = 0;
-                }
-
-                if (m_vars[i].direction.bits) {
-                    // make sure that transfer size > 0
-                    if (m_vars[i].size <= 0) {
-                        LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
-                        exit(1);
-                    }
-
-                    if (m_vars[i].flags.is_static) {
-                        PtrData *ptr_data;
-
-                        // find data associated with variable
-                        if (!find_ptr_data(ptr_data,
-                                           m_vars[i].ptr,
-                                           m_vars[i].disp,
-                                           m_vars[i].size,
-                                           false)) {
-                            return false;
-                        }
-
-                        if (ptr_data != 0) {
-                            // offset to base from the beginning of the buffer
-                            // memory
-                            m_vars[i].offset =
-                                (char*) m_vars[i].ptr -
-                                (char*) ptr_data->cpu_addr.start();
-                        }
-                        else {
-                            m_vars[i].flags.is_static = false;
-                            if (m_vars[i].into == NULL) {
-                                m_vars[i].flags.is_static_dstn = false;
-                            }
-                        }
-                        m_vars_extra[i].src_data = ptr_data;
-                    }
-
-                    if (m_is_openmp) {
-                        if (m_vars[i].flags.is_static) {
-                            // Static data is transferred only by omp target
-                            // update construct which passes zeros for
-                            // alloc_if and free_if.
-                            if (m_vars[i].alloc_if || m_vars[i].free_if) {
-                                m_vars[i].direction.bits = c_parameter_nocopy;
-                            }
-                        }
-                        else {
-                            AutoData *auto_data;
-                            if (m_vars[i].alloc_if) {
-                                auto_data = m_device.insert_auto_data(
-                                    m_vars[i].ptr, m_vars[i].size);
-                                auto_data->add_reference();
-                            }
-                            else {
-                                // TODO: what should be done if var is not in
-                                // the table?
-                                auto_data = m_device.find_auto_data(
-                                    m_vars[i].ptr);
-                            }
-
-                            // For automatic variables data is transferred
-                            // only if alloc_if == 0 && free_if == 0
-                            // or reference count is 1
-                            if ((m_vars[i].alloc_if || m_vars[i].free_if) &&
-                                auto_data != 0 &&
-                                auto_data->get_reference() != 1) {
-                                m_vars[i].direction.bits = c_parameter_nocopy;
-                            }
-
-                            // save data for later use
-                            m_vars_extra[i].auto_data = auto_data;
-                        }
-                    }
-
-                    if (m_vars[i].direction.in &&
-                        !m_vars[i].flags.is_static) {
-                        m_in_datalen += m_vars[i].size;
-
-                        // for non-static target destination defined as CEAN
-                        // expression we pass to target its size and dist
-                        if (m_vars[i].into == NULL &&
-                            m_vars[i].type.src == c_cean_var) {
-                            m_in_datalen += 2 * sizeof(uint64_t);
-                        }
-                        m_need_runfunction = true;
-                    }
-                    if (m_vars[i].direction.out &&
-                        !m_vars[i].flags.is_static) {
-                        m_out_datalen += m_vars[i].size;
-                        m_need_runfunction = true;
-                    }
-                }
-                break;
-
-            case c_dv:
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
-
-                    // debug dump
-                    __dv_desc_dump("IN/OUT", dvp);
-
-                    // send dope vector contents excluding base
-                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
-                    m_need_runfunction = true;
-                }
-                break;
-
-            case c_string_ptr:
-                if ((m_vars[i].direction.bits ||
-                     m_vars[i].alloc_if ||
-                     m_vars[i].free_if) &&
-                    m_vars[i].size == 0) {
-                    m_vars[i].size = 1;
-                    m_vars[i].count =
-                        strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
-                }
-                /* fallthru */
-
-            case c_data_ptr:
-                if (m_vars[i].flags.is_stack_buf &&
-                    !m_vars[i].direction.bits &&
-                    m_vars[i].alloc_if) {
-                    // this var_desc is for stack buffer
-                    bool is_new;
-
-                    if (!offload_stack_memory_manager(
-                            stack_addr, entry_id,
-                            m_vars[i].count, m_vars[i].align, &is_new)) {
-                        return false;
-                    }
-                    if (is_new) {
-                        m_compute_buffers.push_back(
-                            m_stack_ptr_data->mic_buf);
-                        m_device.m_persist_list.front().cpu_stack_addr =
-                            static_cast<char*>(m_vars[i].ptr);
-                    }
-                    else {
-                        m_vars[i].flags.sink_addr = 1;
-                        m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
-                    }
-                    m_vars[i].size = m_destroy_stack.size();
-                    m_vars_extra[i].src_data = m_stack_ptr_data;
-                    // need to add reference for buffer
-                    m_need_runfunction = true;
-                    break;
-                }
-                /* fallthru */
-
-            case c_cean_var_ptr:
-            case c_dv_ptr:
-                if (m_vars[i].type.src == c_cean_var_ptr) {
-                    // array descriptor
-                    const arr_desc *ap =
-                        static_cast<const arr_desc*>(m_vars[i].ptr);
-
-                    // debug dump
-                    __arr_desc_dump("", "IN/OUT", ap, 1);
-
-                    // offset and length are derived from the array descriptor
-                    __arr_data_offset_and_length(ap, m_vars[i].disp,
-                                                 m_vars[i].size);
-
-                    if (!is_arr_desc_contiguous(ap)) {
-                        m_vars[i].flags.is_noncont_src = 1;
-                        m_vars_extra[i].read_rng_src =
-                            init_read_ranges_arr_desc(ap);
-                    }
-                    // all necessary information about length and offset is
-                    // transferred in var descriptor. There is no need to send
-                    // array descriptor to the target side.
-                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
-                }
-                else if (m_vars[i].type.src == c_dv_ptr) {
-                    // need to send DV to the device unless it is 'nocopy'
-                    if (m_vars[i].direction.bits ||
-                        m_vars[i].alloc_if ||
-                        m_vars[i].free_if) {
-                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
-
-                        // debug dump
-                        __dv_desc_dump("IN/OUT", dvp);
-
-                        m_vars[i].direction.bits = c_parameter_in;
-                    }
-
-                    // no displacement
-                    m_vars[i].disp = 0;
-                }
-                else {
-                    // c_data_ptr or c_string_ptr
-                    m_vars[i].size *= m_vars[i].count;
-                    m_vars[i].disp = 0;
-                }
-
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    PtrData *ptr_data;
-
-                    // check that buffer length >= 0
-                    if (m_vars[i].alloc_if &&
-                        m_vars[i].disp + m_vars[i].size < 0) {
-                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
-                        exit(1);
-                    }
-
-                    // base address
-                    void *base = *static_cast<void**>(m_vars[i].ptr);
-
-                    // allocate buffer if we have no INTO and don't need
-                    // allocation for the ptr at target
-                    if (src_is_for_mic) {
-                        if (m_vars[i].flags.is_stack_buf) {
-                            // for stack persistent objects ptr data is created
-                            // by var_desc with number 0.
-                            // Its ptr_data is stored at m_stack_ptr_data
-                            ptr_data = m_stack_ptr_data;
-                            m_vars[i].flags.sink_addr = 1;
-                        }
-                        else if (m_vars[i].alloc_if) {
-                            // add new entry
-                            if (!alloc_ptr_data(
-                                    ptr_data,
-                                    base,
-                                    (alloc_base != NULL) ?
-                                        alloc_disp : m_vars[i].disp,
-                                    (alloc_base != NULL) ?
-                                        alloc_size : m_vars[i].size,
-                                    alloc_disp,
-                                    (alloc_base != NULL) ?
-                                        0 : m_vars[i].align)) {
-                                return false;
-                            }
-
-                            if (ptr_data->add_reference() == 0 &&
-                                ptr_data->mic_buf != 0) {
-                                // add buffer to the list of buffers that
-                                // are passed to dispatch call
-                                m_compute_buffers.push_back(
-                                    ptr_data->mic_buf);
-                            }
-                            else {
-                                // will send buffer address to device
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-
-                            if (!ptr_data->is_static) {
-                                // need to add reference for buffer
-                                m_need_runfunction = true;
-                            }
-                        }
-                        else {
-                            bool error_if_not_found = true;
-                            if (m_is_openmp) {
-                                // For omp target update variable is ignored
-                                // if it does not exist.
-                                if (!m_vars[i].alloc_if &&
-                                    !m_vars[i].free_if) {
-                                    error_if_not_found = false;
-                                }
-                            }
-
-                            // use existing association from pointer table
-                            if (!find_ptr_data(ptr_data,
-                                               base,
-                                               m_vars[i].disp,
-                                               m_vars[i].size,
-                                               error_if_not_found)) {
-                                return false;
-                            }
-
-                            if (m_is_openmp) {
-                                // make var nocopy if it does not exist
-                                if (ptr_data == 0) {
-                                    m_vars[i].direction.bits =
-                                        c_parameter_nocopy;
-                                }
-                            }
-
-                            if (ptr_data != 0) {
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-                        }
-
-                        if (ptr_data != 0) {
-                            if (m_is_openmp) {
-                                // data is transferred only if
-                                // alloc_if == 0 && free_if == 0
-                                // or reference count is 1
-                                if ((m_vars[i].alloc_if ||
-                                     m_vars[i].free_if) &&
-                                    ptr_data->get_reference() != 1) {
-                                    m_vars[i].direction.bits =
-                                        c_parameter_nocopy;
-                                }
-                            }
-
-                            if (ptr_data->alloc_disp != 0) {
-                                m_vars[i].flags.alloc_disp = 1;
-                                m_in_datalen += sizeof(alloc_disp);
-                            }
-
-                            if (m_vars[i].flags.sink_addr) {
-                                // get buffers's address on the sink
-                                if (!init_mic_address(ptr_data)) {
-                                    return false;
-                                }
-
-                                m_in_datalen += sizeof(ptr_data->mic_addr);
-                            }
-
-                            if (!ptr_data->is_static && m_vars[i].free_if) {
-                                // need to decrement buffer reference on target
-                                m_need_runfunction = true;
-                            }
-
-                            // offset to base from the beginning of the buffer
-                            // memory
-                            m_vars[i].offset = (char*) base -
-                                (char*) ptr_data->cpu_addr.start();
-
-                            // copy other pointer properties to var descriptor
-                            m_vars[i].mic_offset = ptr_data->mic_offset;
-                            m_vars[i].flags.is_static = ptr_data->is_static;
-                        }
-                    }
-                    else {
-                        if (!find_ptr_data(ptr_data,
-                                           base,
-                                           m_vars[i].disp,
-                                           m_vars[i].size,
-                                           false)) {
-                            return false;
-                        }
-                        if (ptr_data) {
-                            m_vars[i].offset =
-                                (char*) base -
-                                (char*) ptr_data->cpu_addr.start();
-                        }
-                    }
-
-                    // save pointer data
-                    m_vars_extra[i].src_data = ptr_data;
-                }
-                break;
-
-            case c_func_ptr:
-                if (m_vars[i].direction.in) {
-                    m_in_datalen += __offload_funcs.max_name_length();
-                }
-                if (m_vars[i].direction.out) {
-                    m_out_datalen += __offload_funcs.max_name_length();
-                }
-                m_need_runfunction = true;
-                break;
-
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                ArrDesc *dvp;
-                if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
-                    const arr_desc *ap;
-                    ap = static_cast<const arr_desc*>(m_vars[i].ptr);
-
-                    dvp = (m_vars[i].type.src == c_dv_data_slice) ?
-                          reinterpret_cast<ArrDesc*>(ap->base) :
-                          *reinterpret_cast<ArrDesc**>(ap->base);
-                }
-                else {
-                    dvp = (m_vars[i].type.src == c_dv_data) ?
-                          static_cast<ArrDesc*>(m_vars[i].ptr) :
-                          *static_cast<ArrDesc**>(m_vars[i].ptr);
-                }
-
-                // if allocatable dope vector isn't allocated don't
-                // transfer its data
-                if (!__dv_is_allocated(dvp)) {
-                    m_vars[i].direction.bits = c_parameter_nocopy;
-                    m_vars[i].alloc_if = 0;
-                    m_vars[i].free_if = 0;
-                }
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    const arr_desc *ap;
-
-                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
-                        ap = static_cast<const arr_desc*>(m_vars[i].ptr);
-
-                        // debug dump
-                        __arr_desc_dump("", "IN/OUT", ap, 0);
-                    }
-                    if (!__dv_is_contiguous(dvp)) {
-                        m_vars[i].flags.is_noncont_src = 1;
-                        m_vars_extra[i].read_rng_src =
-                            init_read_ranges_dv(dvp);
-                    }
-
-                    // size and displacement
-                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
-                        // offset and length are derived from the
-                        // array descriptor
-                        __arr_data_offset_and_length(ap,
-                                                     m_vars[i].disp,
-                                                     m_vars[i].size);
-                        if (m_vars[i].direction.bits) {
-                            if (!is_arr_desc_contiguous(ap)) {
-                                if (m_vars[i].flags.is_noncont_src) {
-                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
-                                    return false;
-                                }
-                                m_vars[i].flags.is_noncont_src = 1;
-                                m_vars_extra[i].read_rng_src =
-                                    init_read_ranges_arr_desc(ap);
-                            }
-                        }
-                    }
-                    else {
-                        if (m_vars[i].flags.has_length) {
-                            m_vars[i].size =
-                                __dv_data_length(dvp, m_vars[i].count);
-                        }
-                        else {
-                            m_vars[i].size = __dv_data_length(dvp);
-                        }
-                        m_vars[i].disp = 0;
-                    }
-
-                    // check that length >= 0
-                    if (m_vars[i].alloc_if &&
-                        (m_vars[i].disp + m_vars[i].size < 0)) {
-                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
-                        exit(1);
-                    }
-
-                    // base address
-                    void *base = reinterpret_cast<void*>(dvp->Base);
-                    PtrData *ptr_data;
-
-                    // allocate buffer if we have no INTO and don't need
-                    // allocation for the ptr at target
-                    if (src_is_for_mic) {
-                        if (m_vars[i].alloc_if) {
-                            // add new entry
-                            if (!alloc_ptr_data(
-                                    ptr_data,
-                                    base,
-                                    (alloc_base != NULL) ?
-                                        alloc_disp : m_vars[i].disp,
-                                    (alloc_base != NULL) ?
-                                        alloc_size : m_vars[i].size,
-                                    alloc_disp,
-                                    (alloc_base != NULL) ?
-                                        0 : m_vars[i].align)) {
-                                return false;
-                            }
-
-                            if (ptr_data->add_reference() == 0 &&
-                                ptr_data->mic_buf != 0) {
-                                // add buffer to the list of buffers
-                                // that are passed to dispatch call
-                                m_compute_buffers.push_back(
-                                    ptr_data->mic_buf);
-                            }
-                            else {
-                                // will send buffer address to device
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-
-                            if (!ptr_data->is_static) {
-                                // need to add reference for buffer
-                                m_need_runfunction = true;
-                            }
-                        }
-                        else {
-                            bool error_if_not_found = true;
-                            if (m_is_openmp) {
-                                // For omp target update variable is ignored
-                                // if it does not exist.
-                                if (!m_vars[i].alloc_if &&
-                                    !m_vars[i].free_if) {
-                                    error_if_not_found = false;
-                                }
-                            }
-
-                            // use existing association from pointer table
-                            if (!find_ptr_data(ptr_data,
-                                               base,
-                                               m_vars[i].disp,
-                                               m_vars[i].size,
-                                               error_if_not_found)) {
-                                return false;
-                            }
-
-                            if (m_is_openmp) {
-                                // make var nocopy if it does not exist
-                                if (ptr_data == 0) {
-                                    m_vars[i].direction.bits =
-                                        c_parameter_nocopy;
-                                }
-                            }
-
-                            if (ptr_data != 0) {
-                                // need to update base in dope vector on device
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-                        }
-
-                        if (ptr_data != 0) {
-                            if (m_is_openmp) {
-                                // data is transferred only if
-                                // alloc_if == 0 && free_if == 0
-                                // or reference count is 1
-                                if ((m_vars[i].alloc_if ||
-                                     m_vars[i].free_if) &&
-                                    ptr_data->get_reference() != 1) {
-                                    m_vars[i].direction.bits =
-                                        c_parameter_nocopy;
-                                }
-                            }
-
-                            if (ptr_data->alloc_disp != 0) {
-                                m_vars[i].flags.alloc_disp = 1;
-                                m_in_datalen += sizeof(alloc_disp);
-                            }
-
-                            if (m_vars[i].flags.sink_addr) {
-                                // get buffers's address on the sink
-                                if (!init_mic_address(ptr_data)) {
-                                    return false;
-                                }
-
-                                m_in_datalen += sizeof(ptr_data->mic_addr);
-                            }
-
-                            if (!ptr_data->is_static && m_vars[i].free_if) {
-                                // need to decrement buffer reference on target
-                                m_need_runfunction = true;
-                            }
-
-                            // offset to base from the beginning of the buffer
-                            // memory
-                            m_vars[i].offset =
-                                (char*) base -
-                                (char*) ptr_data->cpu_addr.start();
-
-                            // copy other pointer properties to var descriptor
-                            m_vars[i].mic_offset = ptr_data->mic_offset;
-                            m_vars[i].flags.is_static = ptr_data->is_static;
-                        }
-                    }
-                    else { // !src_is_for_mic
-                        if (!find_ptr_data(ptr_data,
-                                           base,
-                                           m_vars[i].disp,
-                                           m_vars[i].size,
-                                           false)) {
-                            return false;
-                        }
-                        m_vars[i].offset = !ptr_data ? 0 :
-                                (char*) base -
-                                (char*) ptr_data->cpu_addr.start();
-                    }
-
-                    // save pointer data
-                    m_vars_extra[i].src_data = ptr_data;
-                }
-                break;
-
-            default:
-                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
-                LIBOFFLOAD_ABORT;
-        }
-        if (m_vars[i].type.src == c_data_ptr_array) {
-            continue;
-        }
-
-        if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
-            m_vars[i].offset = static_cast<char*>(m_vars[i].ptr) -
-                m_device.m_persist_list.front().cpu_stack_addr;
-        }
-        // if source is used at CPU save its offset and disp
-        if (m_vars[i].into == NULL || m_vars[i].direction.in) {
-            m_vars_extra[i].cpu_offset = m_vars[i].offset;
-            m_vars_extra[i].cpu_disp   = m_vars[i].disp;
-        }
-
-        // If "into" is define we need to do the similar work for it
-        if (!m_vars[i].into) {
-            continue;
-        }
-
-        int64_t into_disp =0, into_offset = 0;
-
-        switch (m_vars[i].type.dst) {
-            case c_data_ptr_array:
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var: {
-                int64_t size = m_vars[i].size;
-
-                if (m_vars[i].type.dst == c_cean_var) {
-                    // array descriptor
-                    const arr_desc *ap =
-                        static_cast<const arr_desc*>(m_vars[i].into);
-
-                    // debug dump
-                    __arr_desc_dump("    ", "INTO", ap, 0);
-
-                    // offset and length are derived from the array descriptor
-                    __arr_data_offset_and_length(ap, into_disp, size);
-
-                    if (!is_arr_desc_contiguous(ap)) {
-                        m_vars[i].flags.is_noncont_dst = 1;
-                        m_vars_extra[i].read_rng_dst =
-                            init_read_ranges_arr_desc(ap);
-                        if (!cean_ranges_match(
-                            m_vars_extra[i].read_rng_src,
-                            m_vars_extra[i].read_rng_dst)) {
-                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
-                            exit(1);
-                        }
-                    }
-                    m_vars[i].into = reinterpret_cast<void*>(ap->base);
-                }
-
-                int64_t size_src = m_vars_extra[i].read_rng_src ?
-                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
-                    m_vars[i].size;
-                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
-                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
-                    size;
-                // It's supposed that "into" size must be not less
-                // than src size
-                if (size_src > size_dst) {
-                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
-                                     size_src, size_dst);
-                    exit(1);
-                }
-
-                if (m_vars[i].direction.bits) {
-                    if (m_vars[i].flags.is_static_dstn) {
-                        PtrData *ptr_data;
-
-                        // find data associated with variable
-                        if (!find_ptr_data(ptr_data, m_vars[i].into,
-                                           into_disp, size, false)) {
-                            return false;
-                        }
-                        if (ptr_data != 0) {
-                            // offset to base from the beginning of the buffer
-                            // memory
-                            into_offset =
-                                (char*) m_vars[i].into -
-                                (char*) ptr_data->cpu_addr.start();
-                        }
-                        else {
-                            m_vars[i].flags.is_static_dstn = false;
-                        }
-                        m_vars_extra[i].dst_data = ptr_data;
-                    }
-                }
-
-                if (m_vars[i].direction.in &&
-                    !m_vars[i].flags.is_static_dstn) {
-                    m_in_datalen += m_vars[i].size;
-
-                    // for non-static target destination defined as CEAN
-                    // expression we pass to target its size and dist
-                    if (m_vars[i].type.dst == c_cean_var) {
-                        m_in_datalen += 2 * sizeof(uint64_t);
-                    }
-                    m_need_runfunction = true;
-                }
-                break;
-            }
-
-            case c_dv:
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
-
-                    // debug dump
-                    __dv_desc_dump("INTO", dvp);
-
-                    // send dope vector contents excluding base
-                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
-                    m_need_runfunction = true;
-                }
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_ptr: {
-                int64_t size = m_vars[i].size;
-
-                if (m_vars[i].type.dst == c_cean_var_ptr) {
-                    // array descriptor
-                    const arr_desc *ap =
-                        static_cast<const arr_desc*>(m_vars[i].into);
-
-                    // debug dump
-                    __arr_desc_dump("    ", "INTO", ap, 1);
-
-                    // offset and length are derived from the array descriptor
-                    __arr_data_offset_and_length(ap, into_disp, size);
-
-                    if (!is_arr_desc_contiguous(ap)) {
-                        m_vars[i].flags.is_noncont_src = 1;
-                        m_vars_extra[i].read_rng_dst =
-                            init_read_ranges_arr_desc(ap);
-                        if (!cean_ranges_match(
-                            m_vars_extra[i].read_rng_src,
-                            m_vars_extra[i].read_rng_dst)) {
-                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
-                        }
-                    }
-                    m_vars[i].into = reinterpret_cast<char**>(ap->base);
-                }
-                else if (m_vars[i].type.dst == c_dv_ptr) {
-                    // need to send DV to the device unless it is 'nocopy'
-                    if (m_vars[i].direction.bits ||
-                        m_vars[i].alloc_if ||
-                        m_vars[i].free_if) {
-                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
-
-                        // debug dump
-                        __dv_desc_dump("INTO", dvp);
-
-                        m_vars[i].direction.bits = c_parameter_in;
-                    }
-                }
-
-                int64_t size_src = m_vars_extra[i].read_rng_src ?
-                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
-                    m_vars[i].size;
-                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
-                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
-                    size;
-                // It's supposed that "into" size must be not less than
-                // src size
-                if (size_src > size_dst) {
-                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
-                                     size_src, size_dst);
-                    exit(1);
-                }
-
-                if (m_vars[i].direction.bits) {
-                    PtrData *ptr_data;
-
-                    // base address
-                    void *base = *static_cast<void**>(m_vars[i].into);
-
-                    if (m_vars[i].direction.in) {
-                        // allocate buffer
-                        if (m_vars[i].flags.is_stack_buf) {
-                            // for stack persistent objects ptr data is created
-                            // by var_desc with number 0.
-                            // Its ptr_data is stored at m_stack_ptr_data
-                            ptr_data = m_stack_ptr_data;
-                            m_vars[i].flags.sink_addr = 1;
-                        }
-                        else if (m_vars[i].alloc_if) {
-                            // add new entry
-                            if (!alloc_ptr_data(
-                                    ptr_data,
-                                    base,
-                                    (alloc_base != NULL) ?
-                                        alloc_disp : into_disp,
-                                    (alloc_base != NULL) ?
-                                        alloc_size : size,
-                                    alloc_disp,
-                                    (alloc_base != NULL) ?
-                                        0 : m_vars[i].align)) {
-                                return false;
-                            }
-
-                            if (ptr_data->add_reference() == 0 &&
-                                ptr_data->mic_buf != 0) {
-                                // add buffer to the list of buffers that
-                                // are passed to dispatch call
-                                m_compute_buffers.push_back(
-                                    ptr_data->mic_buf);
-                            }
-                            else {
-                                // will send buffer address to device
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-
-                            if (!ptr_data->is_static) {
-                                // need to add reference for buffer
-                                m_need_runfunction = true;
-                            }
-                        }
-                        else {
-                            // use existing association from pointer table
-                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
-                                return false;
-                            }
-                            m_vars[i].flags.sink_addr = 1;
-                        }
-
-                        if (ptr_data->alloc_disp != 0) {
-                            m_vars[i].flags.alloc_disp = 1;
-                            m_in_datalen += sizeof(alloc_disp);
-                        }
-
-                        if (m_vars[i].flags.sink_addr) {
-                            // get buffers's address on the sink
-                            if (!init_mic_address(ptr_data)) {
-                                return false;
-                            }
-
-                            m_in_datalen += sizeof(ptr_data->mic_addr);
-                        }
-
-                        if (!ptr_data->is_static && m_vars[i].free_if) {
-                            // need to decrement buffer reference on target
-                            m_need_runfunction = true;
-                        }
-
-                        // copy other pointer properties to var descriptor
-                        m_vars[i].mic_offset = ptr_data->mic_offset;
-                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
-                    }
-                    else {
-                        if (!find_ptr_data(ptr_data,
-                                           base,
-                                           into_disp,
-                                           m_vars[i].size,
-                                           false)) {
-                            return false;
-                        }
-                    }
-                    if (ptr_data) {
-                        into_offset = ptr_data ?
-                            (char*) base -
-                            (char*) ptr_data->cpu_addr.start() :
-                            0;
-                    }
-                    // save pointer data
-                    m_vars_extra[i].dst_data = ptr_data;
-                }
-                break;
-            }
-
-            case c_func_ptr:
-                break;
-
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    const arr_desc *ap;
-                    ArrDesc *dvp;
-                    PtrData *ptr_data;
-                    int64_t disp;
-                    int64_t size;
-
-                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
-                        ap = static_cast<const arr_desc*>(m_vars[i].into);
-
-                        // debug dump
-                        __arr_desc_dump("    ", "INTO", ap, 0);
-
-                        dvp = (m_vars[i].type.dst == c_dv_data_slice) ?
-                              reinterpret_cast<ArrDesc*>(ap->base) :
-                              *reinterpret_cast<ArrDesc**>(ap->base);
-                    }
-                    else {
-                        dvp = (m_vars[i].type.dst == c_dv_data) ?
-                              static_cast<ArrDesc*>(m_vars[i].into) :
-                              *static_cast<ArrDesc**>(m_vars[i].into);
-                    }
-                    if (!__dv_is_contiguous(dvp)) {
-                        m_vars[i].flags.is_noncont_dst = 1;
-                        m_vars_extra[i].read_rng_dst =
-                            init_read_ranges_dv(dvp);
-                    }
-                    // size and displacement
-                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
-                        // offset and length are derived from the array
-                        // descriptor
-                        __arr_data_offset_and_length(ap, into_disp, size);
-                        if (m_vars[i].direction.bits) {
-                            if (!is_arr_desc_contiguous(ap)) {
-                                if (m_vars[i].flags.is_noncont_dst) {
-                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
-                                    return false;
-                                }
-                                m_vars[i].flags.is_noncont_dst = 1;
-                                m_vars_extra[i].read_rng_dst =
-                                    init_read_ranges_arr_desc(ap);
-                                if (!cean_ranges_match(
-                                    m_vars_extra[i].read_rng_src,
-                                    m_vars_extra[i].read_rng_dst)) {
-                                    LIBOFFLOAD_ERROR(c_ranges_dont_match);
-                                }
-                            }
-                        }
-                    }
-                    else {
-                        if (m_vars[i].flags.has_length) {
-                            size = __dv_data_length(dvp, m_vars[i].count);
-                        }
-                        else {
-                            size = __dv_data_length(dvp);
-                        }
-                        disp = 0;
-                    }
-
-                    int64_t size_src =
-                        m_vars_extra[i].read_rng_src ?
-                        cean_get_transf_size(m_vars_extra[i].read_rng_src) :
-                        m_vars[i].size;
-                    int64_t size_dst =
-                        m_vars_extra[i].read_rng_dst ?
-                        cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
-                        size;
-                    // It's supposed that "into" size must be not less
-                    // than src size
-                    if (size_src > size_dst) {
-                        LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
-                            size_src, size_dst);
-                        exit(1);
-                    }
-
-                    // base address
-                    void *base = reinterpret_cast<void*>(dvp->Base);
-
-                    // allocate buffer
-                    if (m_vars[i].direction.in) {
-                        if (m_vars[i].alloc_if) {
-                            // add new entry
-                            if (!alloc_ptr_data(
-                                    ptr_data,
-                                    base,
-                                    (alloc_base != NULL) ?
-                                        alloc_disp : into_disp,
-                                    (alloc_base != NULL) ?
-                                        alloc_size : size,
-                                    alloc_disp,
-                                    (alloc_base != NULL) ?
-                                        0 : m_vars[i].align)) {
-                                return false;
-                            }
-                            if (ptr_data->add_reference() == 0 &&
-                                ptr_data->mic_buf !=0) {
-                                // add buffer to the list of buffers
-                                // that are passed to dispatch call
-                                m_compute_buffers.push_back(
-                                    ptr_data->mic_buf);
-                            }
-                            else {
-                                // will send buffer address to device
-                                m_vars[i].flags.sink_addr = 1;
-                            }
-
-                            if (!ptr_data->is_static) {
-                                // need to add reference for buffer
-                                m_need_runfunction = true;
-                            }
-                        }
-                        else {
-                            // use existing association from pointer table
-                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
-                                return false;
-                            }
-
-                            // need to update base in dope vector on device
-                            m_vars[i].flags.sink_addr = 1;
-                        }
-
-                        if (ptr_data->alloc_disp != 0) {
-                            m_vars[i].flags.alloc_disp = 1;
-                            m_in_datalen += sizeof(alloc_disp);
-                        }
-
-                        if (m_vars[i].flags.sink_addr) {
-                            // get buffers's address on the sink
-                            if (!init_mic_address(ptr_data)) {
-                                return false;
-                            }
-                            m_in_datalen += sizeof(ptr_data->mic_addr);
-                        }
-
-                        if (!ptr_data->is_static && m_vars[i].free_if) {
-                            // need to decrement buffer reference on target
-                            m_need_runfunction = true;
-                        }
-
-                        // offset to base from the beginning of the buffer
-                        // memory
-                        into_offset =
-                            (char*) base - (char*) ptr_data->cpu_addr.start();
-
-                        // copy other pointer properties to var descriptor
-                        m_vars[i].mic_offset = ptr_data->mic_offset;
-                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
-                    }
-                    else { // src_is_for_mic
-                        if (!find_ptr_data(ptr_data,
-                                           base,
-                                           into_disp,
-                                           size,
-                                           false)) {
-                            return false;
-                        }
-                        into_offset = !ptr_data ?
-                            0 :
-                            (char*) base - (char*) ptr_data->cpu_addr.start();
-                    }
-
-                    // save pointer data
-                    m_vars_extra[i].dst_data = ptr_data;
-                }
-                break;
-
-            default:
-                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
-                LIBOFFLOAD_ABORT;
-        }
-        // if into is used at CPU save its offset and disp
-        if (m_vars[i].direction.out) {
-            m_vars_extra[i].cpu_offset = into_offset;
-            m_vars_extra[i].cpu_disp   = into_disp;
-        }
-        else {
-            if (m_vars[i].flags.is_stack_buf) {
-                into_offset = static_cast<char*>(m_vars[i].into) -
-                    m_device.m_persist_list.front().cpu_stack_addr;
-            }
-            m_vars[i].offset = into_offset;
-            m_vars[i].disp   = into_disp;
-        }
-    }
-
-    return true;
-}
-
-bool OffloadDescriptor::setup_misc_data(const char *name)
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
-
-    // we can skip run functon call together with wait if offloaded
-    // region is empty and there is no user defined non-pointer IN/OUT data
-    if (m_need_runfunction) {
-        // variable descriptors are sent as input data
-        m_in_datalen += m_vars_total * sizeof(VarDesc);
-
-        // timer data is sent as a part of the output data
-        m_out_datalen += OFFLOAD_TIMER_DATALEN();
-
-        // max from input data and output data length
-        uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
-                                                           m_out_datalen;
-
-        // Misc data has the following layout
-        //     <Function Descriptor>
-        //     <Function Name>
-        //     <In/Out Data>            (optional)
-        //
-        // We can transfer copyin/copyout data in misc/return data which can
-        // be passed to run function call if its size does not exceed
-        // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
-        // buffer for it.
-
-        m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
-        m_func_desc_size = (m_func_desc_size + 7) & ~7;
-
-        int misc_data_offset = 0;
-        int misc_data_size = 0;
-        if (data_len > 0) {
-            if (m_func_desc_size +
-                m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
-                m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
-                // use misc/return data for copyin/copyout
-                misc_data_offset = m_func_desc_size;
-                misc_data_size = data_len;
-            }
-            else {
-                OffloadTimer timer_buf(get_timer_data(),
-                                       c_offload_host_alloc_data_buffer);
-
-                // send/receive data using buffer
-                COIRESULT res = COI::BufferCreate(data_len,
-                                                  COI_BUFFER_NORMAL,
-                                                  0, 0,
-                                                  1, &m_device.get_process(),
-                                                  &m_inout_buf);
-                if (res != COI_SUCCESS) {
-                    if (m_status != 0) {
-                        m_status->result = translate_coi_error(res);
-                        return false;
-                    }
-                    report_coi_error(c_buf_create, res);
-                }
-
-                m_compute_buffers.push_back(m_inout_buf);
-                m_destroy_buffers.push_back(m_inout_buf);
-            }
-        }
-
-        // initialize function descriptor
-        m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
-                                                   misc_data_size);
-        m_func_desc->console_enabled = console_enabled;
-        m_func_desc->timer_enabled =
-            timer_enabled || (offload_report_level && offload_report_enabled);
-        m_func_desc->offload_report_level = offload_report_level;
-        m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
-        m_func_desc->in_datalen = m_in_datalen;
-        m_func_desc->out_datalen = m_out_datalen;
-        m_func_desc->vars_num = m_vars_total;
-        m_func_desc->data_offset = misc_data_offset;
-
-        // append entry name
-        strcpy(m_func_desc->data, name);
-    }
-
-    return true;
-}
-
-bool OffloadDescriptor::wait_dependencies(
-    const void **waits,
-    int num_waits
-)
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
-    bool ret = true;
-
-    for (int i = 0; i < num_waits; i++) {
-
-        OffloadDescriptor *task = m_device.find_signal(waits[i], true);
-        if (task == 0) {
-            LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
-                             waits[i]);
-            LIBOFFLOAD_ABORT;
-        }
-
-        if (!task->offload_finish()) {
-            ret = false;
-        }
-
-        task->cleanup();
-        delete task;
-    }
-
-    return ret;
-}
-
-bool OffloadDescriptor::offload(
-    const char *name,
-    bool is_empty,
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int vars_total,
-    const void **waits,
-    int num_waits,
-    const void **signal,
-    int entry_id,
-    const void *stack_addr
-)
-{
-    if (signal == 0) {
-        OFFLOAD_DEBUG_TRACE_1(1,
-                      GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_init_func,
-                      "Offload function %s, is_empty=%d, #varDescs=%d, "
-                      "#waits=%d, signal=none\n",
-                      name, is_empty, vars_total, num_waits);
-        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_sent_pointer_data,
-                      "#Wait : %d \n", num_waits);
-        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_signal,
-                      "none %d\n", 0);
-    }
-    else {
-        OFFLOAD_DEBUG_TRACE_1(1,
-                      GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_init_func,
-                      "Offload function %s, is_empty=%d, #varDescs=%d, "
-                      "#waits=%d, signal=%p\n",
-                      name, is_empty, vars_total, num_waits,
-                      *signal);
-
-        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_signal,
-                      "%d\n", signal);
-    }
-    OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
-                      c_offload_wait,
-                      "#Wait : %d  %p\n", num_waits, waits);
-
-    if (m_status != 0) {
-        m_status->result = OFFLOAD_SUCCESS;
-        m_status->device_number = m_device.get_logical_index();
-    }
-
-    m_need_runfunction = !is_empty;
-
-    // wait for dependencies to finish
-    if (!wait_dependencies(waits, num_waits)) {
-        cleanup();
-        return false;
-    }
-
-    // setup buffers
-    if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
-        cleanup();
-        return false;
-    }
-
-    // initiate send for pointers. Want to do it as early as possible.
-    if (!send_pointer_data(signal != 0)) {
-        cleanup();
-        return false;
-    }
-
-    // setup misc data for run function
-    if (!setup_misc_data(name)) {
-        cleanup();
-        return false;
-    }
-
-    // gather copyin data into buffer
-    if (!gather_copyin_data()) {
-        cleanup();
-        return false;
-    }
-
-    // Start the computation
-    if (!compute()) {
-        cleanup();
-        return false;
-    }
-
-    // initiate receive for pointers
-    if (!receive_pointer_data(signal != 0)) {
-        cleanup();
-        return false;
-    }
-
-    // if there is a signal save descriptor for the later use.
-    if (signal != 0) {
-        m_device.add_signal(*signal, this);
-        return true;
-    }
-
-    // wait for the offload to finish.
-    if (!offload_finish()) {
-        cleanup();
-        return false;
-    }
-
-    cleanup();
-    return true;
-}
-
-bool OffloadDescriptor::offload_finish()
-{
-    COIRESULT res;
-
-    // wait for compute dependencies to become signaled
-    if (m_in_deps_total > 0) {
-        OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
-
-        if (__offload_active_wait) {
-            // keep CPU busy
-            do {
-                res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
-            }
-            while (res == COI_TIME_OUT_REACHED);
-        }
-        else {
-            res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
-        }
-
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-                return false;
-            }
-            report_coi_error(c_event_wait, res);
-        }
-    }
-
-    // scatter copyout data received from target
-    if (!scatter_copyout_data()) {
-        return false;
-    }
-    // wait for receive dependencies to become signaled
-    if (m_out_deps_total > 0) {
-        OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
-
-        if (__offload_active_wait) {
-            // keep CPU busy
-            do {
-                res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
-            }
-            while (res == COI_TIME_OUT_REACHED);
-        }
-        else {
-            res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
-        }
-
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-                return false;
-            }
-            report_coi_error(c_event_wait, res);
-        }
-    }
-
-    // destroy buffers
-    {
-        OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
-
-        for (BufferList::const_iterator it = m_destroy_buffers.begin();
-             it != m_destroy_buffers.end(); it++) {
-            res = COI::BufferDestroy(*it);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_destroy, res);
-            }
-        }
-    }
-
-    return true;
-}
-
-void OffloadDescriptor::cleanup()
-{
-    // release device in orsl
-    ORSL::release(m_device.get_logical_index());
-
-    OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
-
-    // report stuff
-    Offload_Report_Epilog(get_timer_data());
-}
-
-bool OffloadDescriptor::is_signaled()
-{
-    bool signaled = true;
-    COIRESULT res;
-
-    // check compute and receive dependencies
-    if (m_in_deps_total > 0) {
-        res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
-        signaled = signaled && (res == COI_SUCCESS);
-    }
-    if (m_out_deps_total > 0) {
-        res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
-        signaled = signaled && (res == COI_SUCCESS);
-    }
-
-    return signaled;
-}
-
-// Send pointer data if source or destination or both of them are
-// noncontiguous. There is guarantee that length of destination enough for
-// transferred data.
-bool OffloadDescriptor::send_noncontiguous_pointer_data(
-    int i,
-    PtrData* src_data,
-    PtrData* dst_data,
-    COIEVENT *event
-    )
-{
-    int64_t offset_src, offset_dst;
-    int64_t length_src, length_dst;
-    int64_t length_src_cur, length_dst_cur;
-    int64_t send_size, data_sent = 0;
-    COIRESULT res;
-    bool dst_is_empty = true;
-    bool src_is_empty = true;
-
-    // Set length_src and length_dst
-    length_src = (m_vars_extra[i].read_rng_src) ?
-        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
-    length_dst = !m_vars[i].into ? length_src :
-                     (m_vars_extra[i].read_rng_dst) ?
-                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
-    send_size = (length_src < length_dst) ? length_src : length_dst;
-
-    // consequently get contiguous ranges,
-    // define corresponded destination offset and send data
-    do {
-        if (src_is_empty) {
-            if (m_vars_extra[i].read_rng_src) {
-                if (!get_next_range(m_vars_extra[i].read_rng_src,
-                         &offset_src)) {
-                    // source ranges are over - nothing to send
-                    break;
-                }
-            }
-            else if (data_sent == 0) {
-                offset_src = m_vars_extra[i].cpu_disp;
-            }
-            else {
-                break;
-            }
-            length_src_cur = length_src;
-        }
-        else {
-            // if source is contiguous or its contiguous range is greater
-            // than destination one
-            offset_src += send_size;
-        }
-        length_src_cur -= send_size;
-        src_is_empty = length_src_cur == 0;
-
-        if (dst_is_empty) {
-            if (m_vars[i].into) {
-                if (m_vars_extra[i].read_rng_dst) {
-                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
-                             &offset_dst)) {
-                        // destination ranges are over
-                        LIBOFFLOAD_ERROR(c_destination_is_over);
-                        return false;
-                    }
-                }
-                // into is contiguous.
-                else {
-                    offset_dst = m_vars[i].disp;
-                }
-                length_dst_cur = length_dst;
-            }
-            // same as source
-            else {
-                offset_dst = offset_src;
-                length_dst_cur = length_src;
-            }
-        }
-        else {
-            // if destination is contiguous or its contiguous range is greater
-            // than source one
-            offset_dst += send_size;
-        }
-        length_dst_cur -= send_size;
-        dst_is_empty = length_dst_cur == 0;
-
-        if (src_data != 0 && src_data->cpu_buf != 0) {
-            res = COI::BufferCopy(
-                dst_data->mic_buf,
-                src_data->cpu_buf,
-                m_vars[i].mic_offset - dst_data->alloc_disp +
-                m_vars[i].offset + offset_dst,
-                m_vars_extra[i].cpu_offset + offset_src,
-                send_size,
-                COI_COPY_UNSPECIFIED,
-                0, 0,
-                event);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_copy, res);
-            }
-        }
-        else {
-            char *base = offload_get_src_base(m_vars[i].ptr,
-                m_vars[i].type.src);
-
-            res = COI::BufferWrite(
-                dst_data->mic_buf,
-                m_vars[i].mic_offset - dst_data->alloc_disp +
-                m_vars[i].offset + offset_dst,
-                base + offset_src,
-                send_size,
-                COI_COPY_UNSPECIFIED,
-                0, 0,
-                event);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_write, res);
-            }
-        }
-        data_sent += length_src;
-    }
-    while (true);
-    return true;
-}
-
-bool OffloadDescriptor::send_pointer_data(bool is_async)
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
-
-    uint64_t ptr_sent = 0;
-    COIRESULT res;
-
-    // Initiate send for pointer data
-    for (int i = 0; i < m_vars_total; i++) {
-        switch (m_vars[i].type.dst) {
-            case c_data_ptr_array:
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-                if (m_vars[i].direction.in &&
-                    m_vars[i].flags.is_static_dstn) {
-                    COIEVENT *event =
-                        (is_async ||
-                         m_vars[i].size >= __offload_use_async_buffer_write) ?
-                        &m_in_deps[m_in_deps_total++] : 0;
-                    PtrData* dst_data = m_vars[i].into ?
-                                            m_vars_extra[i].dst_data :
-                                            m_vars_extra[i].src_data;
-                    PtrData* src_data =
-                        VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
-                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
-                        m_vars[i].flags.is_static ?
-                           m_vars_extra[i].src_data : 0;
-
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        if (!send_noncontiguous_pointer_data(
-                                i, src_data, dst_data, event)) {
-                            return false;
-                        }
-                    }
-                    else if (src_data != 0 && src_data->cpu_buf != 0) {
-                        res = COI::BufferCopy(
-                            dst_data->mic_buf,
-                            src_data->cpu_buf,
-                            m_vars[i].mic_offset - dst_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                        char *base = offload_get_src_base(m_vars[i].ptr,
-                                                          m_vars[i].type.src);
-                        res = COI::BufferWrite(
-                            dst_data->mic_buf,
-                            m_vars[i].mic_offset - dst_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            base + m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_write, res);
-                        }
-                    }
-                    ptr_sent += m_vars[i].size;
-                }
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_ptr:
-                if (m_vars[i].direction.in && m_vars[i].size > 0) {
-                    COIEVENT *event =
-                        (is_async ||
-                         m_vars[i].size >= __offload_use_async_buffer_write) ?
-                        &m_in_deps[m_in_deps_total++] : 0;
-                    PtrData* dst_data = m_vars[i].into ?
-                                            m_vars_extra[i].dst_data :
-                                            m_vars_extra[i].src_data;
-                    PtrData* src_data =
-                        VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
-                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
-                        m_vars[i].flags.is_static ?
-                            m_vars_extra[i].src_data : 0;
-
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        send_noncontiguous_pointer_data(
-                            i, src_data, dst_data, event);
-                    }
-                    else if (src_data != 0 && src_data->cpu_buf != 0) {
-                        res = COI::BufferCopy(
-                            dst_data->mic_buf,
-                            src_data->cpu_buf,
-                            m_vars[i].mic_offset - dst_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                        char *base = offload_get_src_base(m_vars[i].ptr,
-                                                          m_vars[i].type.src);
-                        res = COI::BufferWrite(
-                            dst_data->mic_buf,
-                            m_vars[i].mic_offset - dst_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            base + m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_write, res);
-                        }
-                    }
-
-                    ptr_sent += m_vars[i].size;
-                }
-                break;
-
-            case c_dv_data:
-            case c_dv_ptr_data:
-                if (m_vars[i].direction.in &&
-                    m_vars[i].size > 0) {
-                    PtrData *ptr_data = m_vars[i].into ?
-                                        m_vars_extra[i].dst_data :
-                                        m_vars_extra[i].src_data;
-                    PtrData* src_data = m_vars_extra[i].src_data;
-
-                    COIEVENT *event =
-                        (is_async ||
-                         m_vars[i].size >= __offload_use_async_buffer_write) ?
-                        &m_in_deps[m_in_deps_total++] : 0;
-
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        send_noncontiguous_pointer_data(
-                            i, src_data, ptr_data, event);
-                    }
-                    else if (src_data && src_data->cpu_buf != 0) {
-                        res = COI::BufferCopy(
-                            ptr_data->mic_buf,
-                            src_data->cpu_buf,
-                            m_vars[i].offset + ptr_data->mic_offset -
-                            ptr_data->alloc_disp +
-                            m_vars[i].disp,
-                            m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                        char *base = offload_get_src_base(m_vars[i].ptr,
-                                                          m_vars[i].type.src);
-                        res = COI::BufferWrite(
-                            ptr_data->mic_buf,
-                            ptr_data->mic_offset - ptr_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            base + m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_write, res);
-                        }
-                    }
-                    ptr_sent += m_vars[i].size;
-                }
-                break;
-
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                if (m_vars[i].direction.in &&
-                    m_vars[i].size > 0) {
-                    PtrData *dst_data = m_vars[i].into ?
-                                        m_vars_extra[i].dst_data :
-                                        m_vars_extra[i].src_data;
-                    PtrData* src_data =
-                        (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
-                        VAR_TYPE_IS_DV_DATA(m_vars[i].type.src) ||
-                        VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) ||
-                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
-                        m_vars[i].flags.is_static) ?
-                            m_vars_extra[i].src_data : 0;
-                    COIEVENT *event =
-                        (is_async ||
-                         m_vars[i].size >= __offload_use_async_buffer_write) ?
-                        &m_in_deps[m_in_deps_total++] : 0;
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        send_noncontiguous_pointer_data(
-                            i, src_data, dst_data, event);
-                    }
-                    else if (src_data && src_data->cpu_buf != 0) {
-                        res = COI::BufferCopy(
-                            dst_data->mic_buf,
-                            src_data->cpu_buf,
-                            m_vars[i].offset - dst_data->alloc_disp +
-                            dst_data->mic_offset +
-                            m_vars[i].disp,
-                            m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                        char *base = offload_get_src_base(m_vars[i].ptr,
-                                                          m_vars[i].type.src);
-                        res = COI::BufferWrite(
-                            dst_data->mic_buf,
-                            dst_data->mic_offset - dst_data->alloc_disp +
-                            m_vars[i].offset + m_vars[i].disp,
-                            base + m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            0, 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_write, res);
-                        }
-                    }
-
-                    ptr_sent += m_vars[i].size;
-                }
-                break;
-
-            default:
-                break;
-        }
-
-        // alloc field isn't used at target.
-        // We can reuse it for offset of array pointers.
-        if (m_vars_extra[i].is_arr_ptr_el) {
-            m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
-        }
-    }
-
-    if (m_status) {
-        m_status->data_sent += ptr_sent;
-    }
-
-    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
-    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
-                  c_offload_sent_pointer_data,
-                  "Total pointer data sent to target: [%lld] bytes\n",
-                  ptr_sent);
-
-    return true;
-}
-
-bool OffloadDescriptor::gather_copyin_data()
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
-
-    if (m_need_runfunction && m_in_datalen > 0) {
-        COIMAPINSTANCE map_inst;
-        char *data;
-
-        // init marshaller
-        if (m_inout_buf != 0) {
-            OffloadTimer timer_map(get_timer_data(),
-                                   c_offload_host_map_in_data_buffer);
-
-            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
-                                           COI_MAP_WRITE_ENTIRE_BUFFER,
-                                           0, 0, 0, &map_inst,
-                                           reinterpret_cast<void**>(&data));
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_map, res);
-            }
-        }
-        else {
-            data = (char*) m_func_desc + m_func_desc->data_offset;
-        }
-
-        // send variable descriptors
-        memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
-        data += m_vars_total * sizeof(VarDesc);
-
-        // init marshaller
-        m_in.init_buffer(data, m_in_datalen);
-
-        // Gather copy data into buffer
-        for (int i = 0; i < m_vars_total; i++) {
-            bool src_is_for_mic = (m_vars[i].direction.out ||
-                                   m_vars[i].into == NULL);
-            PtrData* ptr_data = src_is_for_mic ?
-                                m_vars_extra[i].src_data :
-                                m_vars_extra[i].dst_data;
-            if (m_vars[i].flags.alloc_disp) {
-                m_in.send_data(&ptr_data->alloc_disp,
-                               sizeof(ptr_data->alloc_disp));
-            }
-
-            // send sink address to the target
-            if (m_vars[i].flags.sink_addr) {
-                m_in.send_data(&ptr_data->mic_addr,
-                               sizeof(ptr_data->mic_addr));
-            }
-
-            switch (m_vars[i].type.dst) {
-                case c_data_ptr_array:
-                    break;
-                case c_data:
-                case c_void_ptr:
-                case c_cean_var:
-                    if (m_vars[i].direction.in &&
-                        !m_vars[i].flags.is_static_dstn) {
-
-                        char *ptr = offload_get_src_base(m_vars[i].ptr,
-                                                         m_vars[i].type.src);
-                        if (m_vars[i].type.dst == c_cean_var) {
-                            // offset and length are derived from the array
-                            // descriptor
-                            int64_t size = m_vars[i].size;
-                            int64_t disp = m_vars[i].disp;
-                            m_in.send_data(reinterpret_cast<char*>(&size),
-                                           sizeof(int64_t));
-                            m_in.send_data(reinterpret_cast<char*>(&disp),
-                                           sizeof(int64_t));
-                        }
-
-                        m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
-                                       m_vars[i].size);
-                    }
-                    break;
-
-                case c_dv:
-                    if (m_vars[i].direction.bits ||
-                        m_vars[i].alloc_if ||
-                        m_vars[i].free_if) {
-                        // send dope vector excluding base
-                        char *ptr = static_cast<char*>(m_vars[i].ptr);
-                        m_in.send_data(ptr + sizeof(uint64_t),
-                                       m_vars[i].size - sizeof(uint64_t));
-                    }
-                    break;
-
-                case c_data_ptr:
-                    // send to target addresses of obsolete
-                    // stacks to be released
-                    if (m_vars[i].flags.is_stack_buf &&
-                        !m_vars[i].direction.bits &&
-                        m_vars[i].alloc_if &&
-                        m_vars[i].size != 0) {
-                        for (PtrDataList::iterator it =
-                            m_destroy_stack.begin();
-                            it != m_destroy_stack.end(); it++) {
-                            PtrData * ptr_data = *it;
-                            m_in.send_data(&(ptr_data->mic_addr),
-                                sizeof(ptr_data->mic_addr));
-                        }
-                    }
-                    break;
-                case c_func_ptr:
-                    if (m_vars[i].direction.in) {
-                        m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
-                    }
-                    break;
-
-                default:
-                    break;
-            }
-        }
-
-        if (m_status) {
-            m_status->data_sent += m_in.get_tfr_size();
-        }
-
-        if (m_func_desc->data_offset == 0) {
-            OffloadTimer timer_unmap(get_timer_data(),
-                                     c_offload_host_unmap_in_data_buffer);
-            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_unmap, res);
-            }
-        }
-    }
-
-    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
-    OFFLOAD_DEBUG_TRACE_1(1,
-                  GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
-                  "Total copyin data sent to target: [%lld] bytes\n",
-                  m_in.get_tfr_size());
-
-    return true;
-}
-
-bool OffloadDescriptor::compute()
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
-
-    if (m_need_runfunction) {
-        OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
-                              c_offload_compute, "Compute task on MIC\n");
-
-        void* misc = m_func_desc;
-        int   misc_len = m_func_desc_size;
-        void* ret = 0;
-        int   ret_len = 0;
-
-        if (m_func_desc->data_offset != 0) {
-            misc_len += m_in_datalen;
-
-            if (m_out_datalen > 0) {
-                ret = (char*) m_func_desc + m_func_desc->data_offset;
-                ret_len = m_out_datalen;
-            }
-        }
-
-        // dispatch task
-        COIRESULT res;
-        COIEVENT event;
-        res = m_device.compute(m_compute_buffers,
-                               misc, misc_len,
-                               ret, ret_len,
-                               m_in_deps_total,
-                               m_in_deps_total > 0 ? m_in_deps : 0,
-                               &event);
-        if (res != COI_SUCCESS) {
-            if (m_status != 0) {
-                m_status->result = translate_coi_error(res);
-                return false;
-            }
-            report_coi_error(c_pipeline_run_func, res);
-        }
-
-        m_in_deps_total = 1;
-        m_in_deps[0] = event;
-    }
-
-    return true;
-}
-
-// receive pointer data if source or destination or both of them are
-// noncontiguous. There is guarantee that length of destination enough for
-// transferred data.
-bool OffloadDescriptor::receive_noncontiguous_pointer_data(
-    int i,
-    char* base,
-    COIBUFFER dst_buf,
-    COIEVENT *event
-)
-{
-    int64_t offset_src, offset_dst;
-    int64_t length_src, length_dst;
-    int64_t length_src_cur, length_dst_cur;
-    int64_t receive_size, data_received = 0;
-    COIRESULT res;
-    bool dst_is_empty = true;
-    bool src_is_empty = true;
-
-    // Set length_src and length_dst
-    length_src = (m_vars_extra[i].read_rng_src) ?
-        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
-    length_dst = !m_vars[i].into ? length_src :
-                     (m_vars_extra[i].read_rng_dst) ?
-                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
-    receive_size = (length_src < length_dst) ? length_src : length_dst;
-
-    // consequently get contiguous ranges,
-    // define corresponded destination offset and receive data
-    do {
-        // get sorce offset
-        if (src_is_empty) {
-            if (m_vars_extra[i].read_rng_src) {
-                if (!get_next_range(m_vars_extra[i].read_rng_src,
-                         &offset_src)) {
-                    // source ranges are over - nothing to send
-                    break;
-                }
-            }
-            else if (data_received == 0) {
-                offset_src = 0;
-            }
-            else {
-                break;
-            }
-            length_src_cur = length_src;
-        }
-        else {
-            // if source is contiguous or its contiguous range is greater
-            // than destination one
-            offset_src += receive_size;
-        }
-        length_src_cur -= receive_size;
-        src_is_empty = length_src_cur == 0;
-
-        // get destination offset
-        if (dst_is_empty) {
-            if (m_vars[i].into) {
-                if (m_vars_extra[i].read_rng_dst) {
-                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
-                             &offset_dst)) {
-                        // destination ranges are over
-                        LIBOFFLOAD_ERROR(c_destination_is_over);
-                        return false;
-                    }
-                }
-                // destination is contiguous.
-                else {
-                    offset_dst = m_vars_extra[i].cpu_disp;
-                }
-                length_dst_cur = length_dst;
-            }
-            // same as source
-            else {
-                offset_dst = offset_src;
-                length_dst_cur = length_src;
-            }
-        }
-        else {
-            // if destination is contiguous or its contiguous range is greater
-            // than source one
-            offset_dst += receive_size;
-        }
-        length_dst_cur -= receive_size;
-        dst_is_empty = length_dst_cur == 0;
-
-        if (dst_buf != 0) {
-            res = COI::BufferCopy(
-                dst_buf,
-                m_vars_extra[i].src_data->mic_buf,
-                m_vars_extra[i].cpu_offset + offset_dst,
-                m_vars[i].offset + offset_src +
-                m_vars[i].mic_offset -
-                m_vars_extra[i].src_data->alloc_disp,
-                receive_size,
-                COI_COPY_UNSPECIFIED,
-                m_in_deps_total,
-                m_in_deps_total > 0 ? m_in_deps : 0,
-                event);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_copy, res);
-            }
-        }
-        else {
-            res = COI::BufferRead(
-                m_vars_extra[i].src_data->mic_buf,
-                m_vars[i].offset + offset_src +
-                m_vars[i].mic_offset -
-                m_vars_extra[i].src_data->alloc_disp,
-                base + offset_dst,
-                receive_size,
-                COI_COPY_UNSPECIFIED,
-                m_in_deps_total,
-                m_in_deps_total > 0 ? m_in_deps : 0,
-                event);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_read, res);
-            }
-        }
-        data_received += receive_size;
-    }
-    while (true);
-    return true;
-}
-
-bool OffloadDescriptor::receive_pointer_data(bool is_async)
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
-
-    uint64_t ptr_received = 0;
-    COIRESULT res;
-
-    for (int i = 0; i < m_vars_total; i++) {
-        switch (m_vars[i].type.src) {
-            case c_data_ptr_array:
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-                if (m_vars[i].direction.out &&
-                    m_vars[i].flags.is_static) {
-                    COIEVENT *event =
-                        (is_async ||
-                         m_in_deps_total > 0 ||
-                         m_vars[i].size >= __offload_use_async_buffer_read) ?
-                        &m_out_deps[m_out_deps_total++] : 0;
-                    PtrData *ptr_data = NULL;
-                    COIBUFFER dst_buf = NULL; // buffer at host
-                    char *base;
-
-                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
-                        ptr_data = m_vars[i].into ?
-                                   m_vars_extra[i].dst_data :
-                                   m_vars_extra[i].src_data;
-                    }
-                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
-                        if (m_vars[i].flags.is_static_dstn) {
-                            ptr_data = m_vars[i].into ?
-                                       m_vars_extra[i].dst_data :
-                                       m_vars_extra[i].src_data;
-                        }
-                    }
-                    dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
-                    if (dst_buf == NULL) {
-                        base = offload_get_src_base(
-                            m_vars[i].into ?
-                            static_cast<char*>(m_vars[i].into) :
-                            static_cast<char*>(m_vars[i].ptr),
-                            m_vars[i].type.dst);
-                    }
-
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        receive_noncontiguous_pointer_data(
-                            i, base, dst_buf, event);
-                    }
-                    else if (dst_buf != 0) {
-                        res = COI::BufferCopy(
-                            dst_buf,
-                            m_vars_extra[i].src_data->mic_buf,
-                            m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].offset + m_vars[i].disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            m_in_deps_total,
-                            m_in_deps_total > 0 ? m_in_deps : 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                       res = COI::BufferRead(
-                            m_vars_extra[i].src_data->mic_buf,
-                            m_vars[i].offset + m_vars[i].disp,
-                            base + m_vars_extra[i].cpu_offset +
-                            m_vars_extra[i].cpu_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            m_in_deps_total,
-                            m_in_deps_total > 0 ? m_in_deps : 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_read, res);
-                        }
-                    }
-                    ptr_received += m_vars[i].size;
-                }
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-            case c_dv_ptr: {
-                COIBUFFER dst_buf = NULL; // buffer on host
-                if (m_vars[i].direction.out && m_vars[i].size > 0) {
-                    COIEVENT *event =
-                        (is_async ||
-                         m_in_deps_total > 0 ||
-                         m_vars[i].size >= __offload_use_async_buffer_read) ?
-                        &m_out_deps[m_out_deps_total++] : 0;
-
-                    uint64_t dst_offset = 0;
-                    char *base = static_cast<char*>(m_vars[i].ptr);
-
-                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
-                        PtrData *ptr_data = m_vars[i].into ?
-                                            m_vars_extra[i].dst_data :
-                                            m_vars_extra[i].src_data;
-                        dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
-                        if (dst_buf == NULL) {
-                            base = m_vars[i].into ?
-                                   *static_cast<char**>(m_vars[i].into) :
-                                   *static_cast<char**>(m_vars[i].ptr);
-                        }
-                        dst_offset = m_vars_extra[i].cpu_offset +
-                                     m_vars_extra[i].cpu_disp;
-                    }
-                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
-                        if (m_vars[i].flags.is_static_dstn) {
-                            dst_buf = m_vars[i].into ?
-                                        m_vars_extra[i].dst_data->cpu_buf :
-                                        m_vars_extra[i].src_data->cpu_buf;
-                        }
-                        if (dst_buf == NULL) {
-                            base = offload_get_src_base(
-                                m_vars[i].into ?
-                                static_cast<char*>(m_vars[i].into) :
-                                static_cast<char*>(m_vars[i].ptr),
-                                m_vars[i].type.dst);
-                        }
-                        dst_offset = m_vars_extra[i].cpu_offset +
-                                     m_vars_extra[i].cpu_disp;
-                    }
-                    else if (VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst) ||
-                             VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
-                        PtrData *ptr_data = m_vars[i].into != 0 ?
-                                            m_vars_extra[i].dst_data :
-                                            m_vars_extra[i].src_data;
-                        dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
-                        if (dst_buf == NULL) {
-                            base = offload_get_src_base(
-                                m_vars[i].into ?
-                                static_cast<char*>(m_vars[i].into) :
-                                static_cast<char*>(m_vars[i].ptr),
-                                m_vars[i].type.dst);
-
-                        }
-                        dst_offset = m_vars_extra[i].cpu_offset +
-                                     m_vars_extra[i].cpu_disp;
-                    }
-
-                    if (m_vars[i].flags.is_noncont_src ||
-                        m_vars[i].flags.is_noncont_dst) {
-                        receive_noncontiguous_pointer_data(
-                            i, base, dst_buf, event);
-                    }
-                    else if (dst_buf != 0) {
-                        res = COI::BufferCopy(
-                            dst_buf,
-                            m_vars_extra[i].src_data->mic_buf,
-                            dst_offset,
-                            m_vars[i].offset + m_vars[i].disp +
-                                m_vars[i].mic_offset -
-                                m_vars_extra[i].src_data->alloc_disp,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            m_in_deps_total,
-                            m_in_deps_total > 0 ? m_in_deps : 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_copy, res);
-                        }
-                    }
-                    else {
-                        res = COI::BufferRead(
-                            m_vars_extra[i].src_data->mic_buf,
-                            m_vars[i].offset + m_vars[i].disp +
-                                m_vars[i].mic_offset -
-                                m_vars_extra[i].src_data->alloc_disp,
-                            base + dst_offset,
-                            m_vars[i].size,
-                            COI_COPY_UNSPECIFIED,
-                            m_in_deps_total,
-                            m_in_deps_total > 0 ? m_in_deps : 0,
-                            event);
-                        if (res != COI_SUCCESS) {
-                            if (m_status != 0) {
-                                m_status->result = translate_coi_error(res);
-                                return false;
-                            }
-                            report_coi_error(c_buf_read, res);
-                        }
-                    }
-                    ptr_received += m_vars[i].size;
-                }
-                break;
-            }
-
-            default:
-                break;
-        }
-
-        // destroy buffers for obsolete stacks
-        if (m_destroy_stack.size() != 0) {
-            for (PtrDataList::iterator it = m_destroy_stack.begin();
-                it != m_destroy_stack.end(); it++) {
-                PtrData *ptr_data = *it;
-                m_destroy_buffers.push_back(ptr_data->mic_buf);
-                OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
-                                  ptr_data->mic_addr);
-            }
-            m_destroy_stack.clear();
-        }
-        if (m_vars[i].free_if) {
-            // remove association for automatic variables
-            if (m_is_openmp && !m_vars[i].flags.is_static &&
-                (m_vars[i].type.src == c_data ||
-                 m_vars[i].type.src == c_void_ptr ||
-                 m_vars[i].type.src == c_cean_var)) {
-                AutoData *auto_data = m_vars_extra[i].auto_data;
-                if (auto_data != 0 && auto_data->remove_reference() == 0) {
-                    m_device.remove_auto_data(auto_data->cpu_addr.start());
-                }
-            }
-
-            // destroy buffers
-            if (m_vars[i].direction.out || m_vars[i].into == NULL) {
-                if (!VAR_TYPE_IS_PTR(m_vars[i].type.src) &&
-                    !VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) &&
-                    !VAR_TYPE_IS_DV_DATA(m_vars[i].type.src)) {
-                    continue;
-                }
-
-                PtrData *ptr_data = m_vars_extra[i].src_data;
-                if (ptr_data->remove_reference() == 0) {
-                    // destroy buffers
-                    if (ptr_data->cpu_buf != 0) {
-                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
-                    }
-                    if (ptr_data->mic_buf != 0) {
-                        m_destroy_buffers.push_back(ptr_data->mic_buf);
-                    }
-                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
-                                  ptr_data->cpu_addr.start());
-
-                    // remove association from map
-                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
-                }
-            }
-            else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) ||
-                     VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst) ||
-                     VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst)) {
-                PtrData *ptr_data = m_vars_extra[i].dst_data;
-                if (ptr_data->remove_reference() == 0) {
-                    // destroy buffers
-                    if (ptr_data->cpu_buf != 0) {
-                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
-                    }
-                    if (ptr_data->mic_buf != 0) {
-                        m_destroy_buffers.push_back(ptr_data->mic_buf);
-                    }
-                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
-                                  ptr_data->cpu_addr.start());
-
-                    // remove association from map
-                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
-                }
-            }
-        }
-    }
-
-    if (m_status) {
-        m_status->data_received += ptr_received;
-    }
-
-    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
-    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
-                  c_offload_received_pointer_data,
-                  "Total pointer data received from target: [%lld] bytes\n",
-                  ptr_received);
-
-    return true;
-}
-
-bool OffloadDescriptor::scatter_copyout_data()
-{
-    OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
-
-    if (m_need_runfunction && m_out_datalen > 0) {
-
-        // total size that need to be transferred from target to host
-        COIMAPINSTANCE map_inst;
-        COIRESULT res;
-        char *data;
-
-        // output data buffer
-        if (m_func_desc->data_offset == 0) {
-            OffloadTimer timer_map(get_timer_data(),
-                                   c_offload_host_map_out_data_buffer);
-
-            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
-                                           COI_MAP_READ_ONLY, 0, 0, 0,
-                                           &map_inst,
-                                            reinterpret_cast<void**>(&data));
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_map, res);
-            }
-        }
-        else {
-            data = (char*) m_func_desc + m_func_desc->data_offset;
-        }
-
-        // get timing data
-        OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
-        data += OFFLOAD_TIMER_DATALEN();
-
-        // initialize output marshaller
-        m_out.init_buffer(data, m_out_datalen);
-
-        for (int i = 0; i < m_vars_total; i++) {
-            switch (m_vars[i].type.src) {
-                case c_data_ptr_array:
-                    break;
-                case c_data:
-                case c_void_ptr:
-                case c_cean_var:
-                    if (m_vars[i].direction.out &&
-                        !m_vars[i].flags.is_static) {
-
-                        if (m_vars[i].into) {
-                            char *ptr = offload_get_src_base(
-                                static_cast<char*>(m_vars[i].into),
-                                m_vars[i].type.dst);
-                            m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
-                                               m_vars[i].size);
-                        }
-                        else {
-                            m_out.receive_data(
-                                static_cast<char*>(m_vars[i].ptr) +
-                                    m_vars_extra[i].cpu_disp,
-                                m_vars[i].size);
-                        }
-                    }
-                    break;
-
-                case c_func_ptr:
-                    if (m_vars[i].direction.out) {
-                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
-                    }
-                    break;
-
-                default:
-                    break;
-            }
-        }
-
-        if (m_status) {
-            m_status->data_received += m_out.get_tfr_size();
-        }
-
-        if (m_func_desc->data_offset == 0) {
-            OffloadTimer timer_unmap(get_timer_data(),
-                                     c_offload_host_unmap_out_data_buffer);
-
-            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
-            if (res != COI_SUCCESS) {
-                if (m_status != 0) {
-                    m_status->result = translate_coi_error(res);
-                    return false;
-                }
-                report_coi_error(c_buf_unmap, res);
-            }
-        }
-    }
-
-    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
-    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
-                  m_out.get_tfr_size());
-
-    return true;
-}
-
-void get_arr_desc_numbers(
-    const arr_desc *ap,
-    int64_t el_size,
-    int64_t &offset,
-    int64_t &size,
-    int     &el_number,
-    CeanReadRanges* &ptr_ranges
-)
-{
-    if (is_arr_desc_contiguous(ap)) {
-        ptr_ranges = NULL;
-        __arr_data_offset_and_length(ap, offset, size);
-        el_number = size / el_size;
-    }
-    else {
-        ptr_ranges = init_read_ranges_arr_desc(ap);
-        el_number = (ptr_ranges->range_size / el_size) *
-                    ptr_ranges->range_max_number;
-        size = ptr_ranges->range_size;
-    }
-}
-
-arr_desc * make_arr_desc(
-    void*   ptr_val,
-    int64_t extent_start_val,
-    int64_t extent_elements_val,
-    int64_t size
-)
-{
-    arr_desc *res;
-    res = (arr_desc *)malloc(sizeof(arr_desc));
-    res->base = reinterpret_cast<int64_t>(ptr_val);
-    res->rank = 1;
-    res->dim[0].size = size;
-    res->dim[0].lindex = 0;
-    res->dim[0].lower = extent_start_val;
-    res->dim[0].upper = extent_elements_val + extent_start_val - 1;
-    res->dim[0].stride = 1;
-    return res;
-}
-
-bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
-{
-    int             pointers_number;
-    int             tmp_val;
-    int             new_index = m_vars_total;
-    const arr_desc *ap;
-    const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
-    int             flags = vd3->array_fields;
-    bool            src_is_for_mic = (m_vars[i].direction.out ||
-                                      m_vars[i].into == NULL);
-
-    ReadArrElements<void *>  ptr;
-    ReadArrElements<void *>  into;
-    ReadArrElements<int64_t> ext_start;
-    ReadArrElements<int64_t> ext_elements;
-    ReadArrElements<int64_t> align;
-    ReadArrElements<int64_t> alloc_if;
-    ReadArrElements<int64_t> free_if;
-    ReadArrElements<int64_t> into_start;
-    ReadArrElements<int64_t> into_elem;
-    ReadArrElements<int64_t> alloc_start;
-    ReadArrElements<int64_t> alloc_elem;
-
-
-    ap = static_cast<const arr_desc*>(vd3->ptr_array);
-
-    // "pointers_number" for total number of transferred pointers.
-    // For each of them we create new var_desc and put it at the bottom
-    // of the var_desc's array
-    get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
-        pointers_number, ptr.ranges);
-    ptr.base = reinterpret_cast<char*>(ap->base);
-
-    // 2. prepare memory for new var_descs
-    m_vars_total += pointers_number;
-    m_vars       = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
-    m_vars_extra =
-        (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
-    m_in_deps    =
-        (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
-    m_out_deps   =
-        (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
-
-    // 3. Prepare for reading new var_desc's fields
-    //    EXTENT START
-    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->extent_start);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
-            ext_start.size, tmp_val, ext_start.ranges);
-        ext_start.base = reinterpret_cast<char*>(ap->base);
-        ext_start.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
-        ext_start.val = (int64_t)vd3->extent_start;
-    }
-    else {
-        ext_start.val = 0;
-    }
-
-    //    EXTENT ELEMENTS NUMBER
-    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->extent_elements);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
-            ext_elements.offset, ext_elements.size,
-            tmp_val, ext_elements.ranges);
-        ext_elements.base = reinterpret_cast<char*>(ap->base);
-        ext_elements.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
-        ext_elements.val = (int64_t)vd3->extent_elements;
-    }
-    else {
-        ext_elements.val = m_vars[i].count;
-    }
-
-    //    ALLOC_IF
-    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
-            alloc_if.size, tmp_val, alloc_if.ranges);
-        alloc_if.base = reinterpret_cast<char*>(ap->base);
-        alloc_if.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
-            return false;
-        }
-    }
-    else {
-        alloc_if.val = m_vars[i].count;
-    }
-
-    //    FREE_IF
-    if ((flags & (1<<flag_free_if_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->free_if_array);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
-            free_if.size, tmp_val, free_if.ranges);
-        free_if.base = reinterpret_cast<char*>(ap->base);
-        free_if.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
-            return false;
-        }
-    }
-    else {
-        free_if.val = m_vars[i].count;
-    }
-
-    //    ALIGN
-
-    if ((flags & (1<<flag_align_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->align_array);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
-            align.size, tmp_val, align.ranges);
-        align.base = reinterpret_cast<char*>(ap->base);
-        align.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
-            return false;
-        }
-    }
-    else {
-        align.val = m_vars[i].align;
-    }
-
-    // 3.1 INTO
-
-    if (m_vars[i].into) {
-        ap = static_cast<const arr_desc*>(m_vars[i].into);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
-            into.size, tmp_val, into.ranges);
-        into.base = reinterpret_cast<char*>(ap->base);
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
-            return false;
-        }
-    }
-
-    // 3.2 INTO_START
-
-    if ((flags & (1<<flag_into_start_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->into_start);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
-            into_start.size, tmp_val, into_start.ranges);
-        into_start.base = reinterpret_cast<char*>(ap->base);
-        into_start.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
-        into_start.val = (int64_t)vd3->into_start;
-    }
-    else {
-        into_start.val = 0;
-    }
-
-    // 3.3 INTO_ELEMENTS
-
-    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->into_elements);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
-            into_elem.size, tmp_val, into_elem.ranges);
-        into_elem.base = reinterpret_cast<char*>(ap->base);
-        into_elem.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
-        into_elem.val = (int64_t)vd3->into_elements;
-    }
-    else {
-        into_elem.val = m_vars[i].count;
-    }
-
-    //    alloc_start
-
-    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->alloc_start);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
-            alloc_start.offset, alloc_start.size, tmp_val,
-            alloc_start.ranges);
-        alloc_start.base = reinterpret_cast<char*>(ap->base);
-        alloc_start.el_size = ap->dim[ap->rank - 1].size;
-
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
-        alloc_start.val = (int64_t)vd3->alloc_start;
-    }
-    else {
-        alloc_start.val = 0;
-    }
-
-    //    alloc_elem
-
-    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
-        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
-        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
-            alloc_elem.size, tmp_val, alloc_elem.ranges);
-        alloc_elem.base = reinterpret_cast<char*>(ap->base);
-        alloc_elem.el_size = ap->dim[ap->rank - 1].size;
-        if (tmp_val < pointers_number) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
-                             "alloc_extent elements");
-            return false;
-        }
-    }
-    else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
-        alloc_elem.val = (int64_t)vd3->alloc_elements;
-    }
-    else {
-        alloc_elem.val = 0;
-    }
-
-    for (int k = 0; k < pointers_number; k++) {
-        int type = flags & 0x3f;
-        int type_src, type_dst;
-        //  Get new values
-        // type_src, type_dst
-        type_src = type_dst = (type == c_data_ptr_array) ?
-                              c_data_ptr   : (type == c_func_ptr_array) ?
-                              c_func_ptr   : (type == c_void_ptr_array) ?
-                              c_void_ptr   : (type == c_string_ptr_array) ?
-                              c_string_ptr : 0;
-
-        // Get ptr val
-        if (!ptr.read_next(true)) {
-            break;
-        }
-        else {
-            ptr.val = (void*)(ptr.base + ptr.offset);
-        }
-
-        // !!! If we got error at phase of reading - it's an internal
-        // !!! error, as we must detect mismatch before
-
-        // Get into val
-        if (m_vars[i].into) {
-            if (!into.read_next(true)) {
-                LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
-                LIBOFFLOAD_ABORT;
-            }
-            else {
-                into.val = (void*)(into.base + into.offset);
-            }
-        }
-
-        // Get other components of the clause
-        if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!ext_elements.read_next(
-                flags & (1<<flag_extent_elements_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!align.read_next(flags & (1<<flag_align_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
-            LIBOFFLOAD_ABORT;
-        }
-        if (!alloc_elem.read_next(
-                 flags & (1<<flag_alloc_elements_is_array))) {
-            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
-            LIBOFFLOAD_ABORT;
-        }
-
-        m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
-        m_vars[new_index + k].alloc_if = alloc_if.val;
-        m_vars[new_index + k].free_if = free_if.val;
-        m_vars[new_index + k].align = align.val;
-        m_vars[new_index + k].mic_offset = 0;
-        m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
-        m_vars[new_index + k].offset = 0;
-        m_vars[new_index + k].size = m_vars[i].size;
-
-        if (ext_start.val == 0) {
-            m_vars[new_index + k].count = ext_elements.val;
-            m_vars[new_index + k].ptr = ptr.val;
-            if (type_src == c_string_ptr) {
-                m_vars[new_index + k].size = 0;
-            }
-        }
-        else {
-            m_vars[new_index + k].count = 0;
-            m_vars[new_index + k].ptr =
-                static_cast<void*>(make_arr_desc(
-                ptr.val,
-                ext_start.val,
-                ext_elements.val,
-                m_vars[i].size));
-
-            type_src = type_src == c_data_ptr ? c_cean_var_ptr :
-                                   c_string_ptr ? c_cean_var_ptr :
-                                   type_src;
-            if (!m_vars[i].into) {
-                type_dst = type_src;
-            }
-        }
-
-        if (m_vars[i].into && into_elem.val != 0) {
-            m_vars[new_index + k].into =
-                static_cast<void*>(make_arr_desc(
-                into.val,
-                into_start.val,
-                into_elem.val,
-                m_vars[i].size));
-            type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
-                       (type == c_string_ptr_array) ? c_cean_var_ptr :
-                        type_src;
-        }
-        else {
-            m_vars[new_index + k].into = NULL;
-        }
-
-        if (alloc_elem.val != 0) {
-            m_vars[new_index + k].alloc =
-                static_cast<void*>(make_arr_desc(
-                ptr.val,
-                alloc_start.val,
-                alloc_elem.val,
-                m_vars[i].size));
-        }
-        else {
-            m_vars[new_index + k].alloc = NULL;
-        }
-
-        m_vars[new_index + k].type.src = type_src;
-        m_vars[new_index + k].type.dst = type_dst;
-
-        m_vars_extra[new_index + k].is_arr_ptr_el = 1;
-        m_vars_extra[new_index + k].ptr_arr_offset =
-            src_is_for_mic ? ptr.offset : into.offset;
-    }
-    // count and alloc fields are useless at target. They can be reused
-    // for pointer arrays.
-    m_vars[i].count = pointers_number;
-    m_vars[i].ptr_arr_offset = new_index;
-    return true;
-}
-
-static void __offload_fini_library(void)
-{
-    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
-    if (mic_engines_total > 0) {
-        delete[] mic_engines;
-
-        if (mic_proxy_fs_root != 0) {
-            free(mic_proxy_fs_root);
-            mic_proxy_fs_root = 0;
-        }
-
-        if (mic_library_path != 0) {
-            free(mic_library_path);
-            mic_library_path = 0;
-        }
-
-        // destroy thread key
-        thread_key_delete(mic_thread_key);
-    }
-
-    // unload COI library
-    if (COI::is_available) {
-        COI::fini();
-    }
-
-    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
-}
-
-static void __offload_init_library_once(void)
-{
-    COIRESULT res;
-    uint32_t num_devices;
-    std::bitset<MIC_ENGINES_MAX> devices;
-
-    prefix = report_get_message_str(c_report_host);
-
-    // initialize trace
-    const char *env_var = getenv(htrace_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t new_val;
-        if (__offload_parse_int_string(env_var, new_val)) {
-            console_enabled = new_val & 0x0f;
-        }
-    }
-
-    env_var = getenv(offload_report_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t env_val;
-        if (__offload_parse_int_string(env_var, env_val)) {
-            if (env_val == OFFLOAD_REPORT_1 ||
-                env_val == OFFLOAD_REPORT_2 ||
-                env_val == OFFLOAD_REPORT_3) {
-                offload_report_level = env_val;
-            }
-            else {
-                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
-                                 offload_report_envname);
-            }
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
-                             offload_report_envname);
-        }
-    }
-    else if (!offload_report_level) {
-        env_var = getenv(timer_envname);
-        if (env_var != 0 && *env_var != '\0') {
-            timer_enabled = atoi(env_var);
-        }
-    }
-
-    // initialize COI
-    if (!COI::init()) {
-        return;
-    }
-
-    // get number of devices installed in the system
-    res = COI::EngineGetCount(COI_ISA_KNC, &num_devices);
-    if (res != COI_SUCCESS) {
-        return;
-    }
-
-    if (num_devices > MIC_ENGINES_MAX) {
-        num_devices = MIC_ENGINES_MAX;
-    }
-
-    // fill in the list of devices that can be used for offloading
-    env_var = getenv("OFFLOAD_DEVICES");
-    if (env_var != 0) {
-        if (strcasecmp(env_var, "none") != 0) {
-            // value is composed of comma separated physical device indexes
-            char *buf = strdup(env_var);
-            char *str, *ptr;
-            for (str = strtok_r(buf, ",", &ptr); str != 0;
-                 str = strtok_r(0, ",", &ptr)) {
-                // convert string to an int
-                int64_t num;
-                if (!__offload_parse_int_string(str, num)) {
-                    LIBOFFLOAD_ERROR(c_mic_init5);
-
-                    // fallback to using all installed devices
-                    devices.reset();
-                    for (int i = 0; i < num_devices; i++) {
-                        devices.set(i);
-                    }
-                    break;
-                }
-                if (num < 0 || num >= num_devices) {
-                    LIBOFFLOAD_ERROR(c_mic_init6, num);
-                    continue;
-                }
-                devices.set(num);
-            }
-            free(buf);
-        }
-    }
-    else {
-        // use all available devices
-        for (int i = 0; i < num_devices; i++) {
-            COIENGINE engine;
-            res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine);
-            if (res == COI_SUCCESS) {
-                devices.set(i);
-            }
-        }
-    }
-
-    mic_engines_total = devices.count();
-
-    // no need to continue if there are no devices to offload to
-    if (mic_engines_total <= 0) {
-        return;
-    }
-
-    // initialize indexes for available devices
-    mic_engines = new Engine[mic_engines_total];
-    for (int p_idx = 0, l_idx = 0; p_idx < num_devices; p_idx++) {
-        if (devices[p_idx]) {
-            mic_engines[l_idx].set_indexes(l_idx, p_idx);
-            l_idx++;
-        }
-    }
-
-    // library search path for device binaries
-    env_var = getenv("MIC_LD_LIBRARY_PATH");
-    if (env_var != 0) {
-        mic_library_path = strdup(env_var);
-    }
-
-    // memory size reserved for COI buffers
-    env_var = getenv("MIC_BUFFERSIZE");
-    if (env_var != 0) {
-        uint64_t new_size;
-        if (__offload_parse_size_string(env_var, new_size)) {
-            mic_buffer_size = new_size;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
-        }
-    }
-
-    // determine stacksize for the pipeline on the device
-    env_var = getenv("MIC_STACKSIZE");
-    if (env_var != 0 && *env_var != '\0') {
-        uint64_t new_size;
-        if (__offload_parse_size_string(env_var, new_size) &&
-            (new_size >= 16384) && ((new_size & 4095) == 0)) {
-            mic_stack_size = new_size;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_mic_init3);
-        }
-    }
-
-    // proxy I/O
-    env_var = getenv("MIC_PROXY_IO");
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t new_val;
-        if (__offload_parse_int_string(env_var, new_val)) {
-            mic_proxy_io = new_val;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
-        }
-    }
-    env_var = getenv("MIC_PROXY_FS_ROOT");
-    if (env_var != 0 && *env_var != '\0') {
-        mic_proxy_fs_root = strdup(env_var);
-    }
-
-    // Prepare environment for the target process using the following
-    // rules
-    // - If MIC_ENV_PREFIX is set then any environment variable on the
-    //   host which has that prefix are copied to the device without
-    //   the prefix.
-    //   All other host environment variables are ignored.
-    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
-    //   environment is duplicated.
-    env_var = getenv("MIC_ENV_PREFIX");
-    if (env_var != 0 && *env_var != '\0') {
-        mic_env_vars.set_prefix(env_var);
-
-        int len = strlen(env_var);
-        for (int i = 0; environ[i] != 0; i++) {
-            if (strncmp(environ[i], env_var, len) == 0 &&
-                strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
-                environ[i][len] != '=') {
-                mic_env_vars.analyze_env_var(environ[i]);
-            }
-        }
-    }
-
-    // create key for thread data
-    if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
-        LIBOFFLOAD_ERROR(c_mic_init4, errno);
-        return;
-    }
-
-    // cpu frequency
-    cpu_frequency = COI::PerfGetCycleFrequency();
-
-    env_var = getenv(mic_use_2mb_buffers_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        uint64_t new_size;
-        if (__offload_parse_size_string(env_var, new_size)) {
-            __offload_use_2mb_buffers = new_size;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
-                             mic_use_2mb_buffers_envname);
-        }
-    }
-
-    env_var = getenv(mic_use_async_buffer_write_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        uint64_t new_size;
-        if (__offload_parse_size_string(env_var, new_size)) {
-            __offload_use_async_buffer_write = new_size;
-        }
-    }
-
-    env_var = getenv(mic_use_async_buffer_read_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        uint64_t new_size;
-        if (__offload_parse_size_string(env_var, new_size)) {
-            __offload_use_async_buffer_read = new_size;
-        }
-    }
-
-    // mic initialization type
-    env_var = getenv(offload_init_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        if (strcmp(env_var, "on_offload") == 0) {
-            __offload_init_type = c_init_on_offload;
-        }
-        else if (strcmp(env_var, "on_offload_all") == 0) {
-            __offload_init_type = c_init_on_offload_all;
-        }
-#ifndef TARGET_WINNT
-        else if (strcmp(env_var, "on_start") == 0) {
-            __offload_init_type = c_init_on_start;
-        }
-#endif // TARGET_WINNT
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
-        }
-    }
-
-    // active wait
-    env_var = getenv(offload_active_wait_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t new_val;
-        if (__offload_parse_int_string(env_var, new_val)) {
-            __offload_active_wait = new_val;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
-                             offload_active_wait_envname);
-        }
-    }
-
-    // omp device num
-    env_var = getenv(omp_device_num_envname);
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t new_val;
-        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
-            __omp_device_num = new_val;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
-                             omp_device_num_envname);
-        }
-    }
-
-    // init ORSL
-    ORSL::init();
-}
-
-extern int __offload_init_library(void)
-{
-    // do one time intialization
-    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
-    __offload_run_once(&ctrl, __offload_init_library_once);
-
-    // offload is available if COI is available and the number of devices > 0
-    bool is_available = COI::is_available && (mic_engines_total > 0);
-
-    // register pending libraries if there are any
-    if (is_available && __target_libs) {
-        mutex_locker_t locker(__target_libs_lock);
-
-        for (TargetImageList::iterator it = __target_libs_list.begin();
-             it != __target_libs_list.end(); it++) {
-            // Register library in COI
-            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
-                                          &it->origin, &it->offset);
-
-            // add lib to all engines
-            for (int i = 0; i < mic_engines_total; i++) {
-                mic_engines[i].add_lib(*it);
-            }
-        }
-
-        __target_libs = false;
-        __target_libs_list.clear();
-    }
-
-    return is_available;
-}
-
-extern "C" void __offload_register_image(const void *target_image)
-{
-    const struct Image *image = static_cast<const struct Image*>(target_image);
-
-    // decode image
-    const char *name = image->data;
-    const void *data = image->data + strlen(image->data) + 1;
-    uint64_t    size = image->size;
-    const char *origin = 0;
-    uint64_t    offset = 0;
-
-    // our actions depend on the image type
-    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
-    switch (hdr->e_type) {
-        case ET_EXEC:
-            // Each offload application is supposed to have only one target
-            // image representing target executable.
-            // No thread synchronization is required here as the initialization
-            // code is always executed in a single thread.
-            if (__target_exe != 0) {
-                LIBOFFLOAD_ERROR(c_multiple_target_exes);
-                exit(1);
-            }
-            __target_exe = new TargetImage(name, data, size, origin, offset);
-
-            // Registration code for execs is always called from the context
-            // of main and thus we can safely call any function here,
-            // including LoadLibrary API on windows. This is the place where
-            // we do the offload library initialization.
-            if (__offload_init_library()) {
-                // initialize engine if init_type is on_start
-                if (__offload_init_type == c_init_on_start) {
-                    for (int i = 0; i < mic_engines_total; i++) {
-                        mic_engines[i].init();
-                    }
-                }
-            }
-            break;
-
-        case ET_DYN:
-            // Registration code for libraries is called from the DllMain
-            // context (on windows) and thus we cannot do anything useful
-            // here. So we just add it to the list of pending libraries for
-            // the later use.
-            __target_libs_lock.lock();
-            __target_libs = true;
-            __target_libs_list.push_back(TargetImage(name, data, size,
-                                                     origin, offset));
-            __target_libs_lock.unlock();
-            break;
-
-        default:
-            // something is definitely wrong, issue an error and exit
-            LIBOFFLOAD_ERROR(c_unknown_binary_type);
-            exit(1);
-    }
-}
-
-extern "C" void __offload_unregister_image(const void *target_image)
-{
-    // Target image is packed as follows:
-    //      8 bytes                - size of the target binary
-    //      null-terminated string - binary name
-    //      <size> bytes           - binary contents
-    const struct Image {
-         int64_t size;
-         char data[];
-    } *image = static_cast<const struct Image*>(target_image);
-
-    // decode image
-    const char *name = image->data;
-    const void *data = image->data + strlen(image->data) + 1;
-
-    // our actions depend on the image type
-    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
-    if (hdr->e_type == ET_EXEC) {
-        // We are executing exec's desctructors.
-        // It is time to do a library cleanup.
-        if (timer_enabled) {
-            Offload_Timer_Print();
-        }
-
-#ifdef MYO_SUPPORT
-        __offload_myoFini();
-#endif // MYO_SUPPORT
-
-        __offload_fini_library();
-    }
-}
-
-// Runtime trace interface for user programs
-
-void __offload_console_trace(int level)
-{
-    console_enabled = level;
-}
-
-// User-visible offload API
-
-int _Offload_number_of_devices(void)
-{
-    __offload_init_library();
-    return mic_engines_total;
-}
-
-int _Offload_get_device_number(void)
-{
-    return -1;
-}
-
-int _Offload_get_physical_device_number(void)
-{
-    return -1;
-}
-
-int _Offload_signaled(int index, void *signal)
-{
-    __offload_init_library();
-
-    // check index value
-    if (index < 0 || mic_engines_total <= 0) {
-        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
-        LIBOFFLOAD_ABORT;
-    }
-
-    // find associated async task
-    OffloadDescriptor *task =
-        mic_engines[index % mic_engines_total].find_signal(signal, false);
-    if (task == 0) {
-        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
-        LIBOFFLOAD_ABORT;
-    }
-
-    return task->is_signaled();
-}
-
-void _Offload_report(int val)
-{
-    if (val == OFFLOAD_REPORT_ON ||
-        val == OFFLOAD_REPORT_OFF) {
-        offload_report_enabled = val;
-    }
-}
-
-// IDB support
-int   __dbg_is_attached = 0;
-int   __dbg_target_id = -1;
-pid_t __dbg_target_so_pid = -1;
-char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
-const int __dbg_api_major_version = 1;
-const int __dbg_api_minor_version = 0;
-
-void __dbg_target_so_loaded()
-{
-}
-void __dbg_target_so_unloaded()
-{
-}
diff --git a/offload/src/offload_host.h b/offload/src/offload_host.h
deleted file mode 100644
index ea23996..0000000
--- a/offload/src/offload_host.h
+++ /dev/null
@@ -1,343 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*! \file
-    \brief The parts of the runtime library used only on the host
-*/
-
-#ifndef OFFLOAD_HOST_H_INCLUDED
-#define OFFLOAD_HOST_H_INCLUDED
-
-#ifndef TARGET_WINNT
-#include <unistd.h>
-#endif // TARGET_WINNT
-#include "offload_common.h"
-#include "offload_util.h"
-#include "offload_engine.h"
-#include "offload_env.h"
-#include "offload_orsl.h"
-#include "coi/coi_client.h"
-
-// MIC engines.
-extern Engine*  mic_engines;
-extern uint32_t mic_engines_total;
-
-//! The target image is packed as follows.
-/*!      1. 8 bytes containing the size of the target binary          */
-/*!      2. a null-terminated string which is the binary name         */
-/*!      3. <size> number of bytes that are the contents of the image */
-/*!      The address of symbol __offload_target_image
-             is the address of this structure.                        */
-struct Image {
-     int64_t size; //!< Size in bytes of the target binary name and contents
-     char data[];  //!< The name and contents of the target image
-};
-
-// The offload descriptor.
-class OffloadDescriptor
-{
-public:
-    OffloadDescriptor(
-        int index,
-        _Offload_status *status,
-        bool is_mandatory,
-        bool is_openmp,
-        OffloadHostTimerData * timer_data
-    ) :
-        m_device(mic_engines[index % mic_engines_total]),
-        m_is_mandatory(is_mandatory),
-        m_is_openmp(is_openmp),
-        m_inout_buf(0),
-        m_func_desc(0),
-        m_func_desc_size(0),
-        m_in_deps(0),
-        m_in_deps_total(0),
-        m_out_deps(0),
-        m_out_deps_total(0),
-        m_vars(0),
-        m_vars_extra(0),
-        m_status(status),
-        m_timer_data(timer_data)
-    {}
-
-    ~OffloadDescriptor()
-    {
-        if (m_in_deps != 0) {
-            free(m_in_deps);
-        }
-        if (m_out_deps != 0) {
-            free(m_out_deps);
-        }
-        if (m_func_desc != 0) {
-            free(m_func_desc);
-        }
-        if (m_vars != 0) {
-            free(m_vars);
-            free(m_vars_extra);
-        }
-    }
-
-    bool offload(const char *name, bool is_empty,
-                 VarDesc *vars, VarDesc2 *vars2, int vars_total,
-                 const void **waits, int num_waits, const void **signal,
-                 int entry_id, const void *stack_addr);
-    bool offload_finish();
-
-    bool is_signaled();
-
-    OffloadHostTimerData* get_timer_data() const {
-        return m_timer_data;
-    }
-
-private:
-    bool wait_dependencies(const void **waits, int num_waits);
-    bool setup_descriptors(VarDesc *vars, VarDesc2 *vars2, int vars_total,
-                           int entry_id, const void *stack_addr);
-    bool setup_misc_data(const char *name);
-    bool send_pointer_data(bool is_async);
-    bool send_noncontiguous_pointer_data(
-        int i,
-        PtrData* src_buf,
-        PtrData* dst_buf,
-        COIEVENT *event);
-    bool receive_noncontiguous_pointer_data(
-        int i,
-        char* src_data,
-        COIBUFFER dst_buf,
-        COIEVENT *event);
-
-    bool gather_copyin_data();
-
-    bool compute();
-
-    bool receive_pointer_data(bool is_async);
-    bool scatter_copyout_data();
-
-    void cleanup();
-
-    bool find_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
-                       int64_t length, bool error_does_not_exist = true);
-    bool alloc_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
-                        int64_t length, int64_t alloc_disp, int align);
-    bool init_static_ptr_data(PtrData *ptr_data);
-    bool init_mic_address(PtrData *ptr_data);
-    bool offload_stack_memory_manager(const void * stack_begin, int routine_id,
-                                      int buf_size, int align, bool *is_new);
-    bool nullify_target_stack(COIBUFFER targ_buf, uint64_t size);
-
-    bool gen_var_descs_for_pointer_array(int i);
-
-    void report_coi_error(error_types msg, COIRESULT res);
-    _Offload_result translate_coi_error(COIRESULT res) const;
-
-private:
-    typedef std::list<COIBUFFER> BufferList;
-
-    // extra data associated with each variable descriptor
-    struct VarExtra {
-        PtrData* src_data;
-        PtrData* dst_data;
-        AutoData* auto_data;
-        int64_t cpu_disp;
-        int64_t cpu_offset;
-        CeanReadRanges *read_rng_src;
-        CeanReadRanges *read_rng_dst;
-        int64_t ptr_arr_offset;
-        bool is_arr_ptr_el;
-    };
-
-    template<typename T> class ReadArrElements {
-    public:
-        ReadArrElements():
-            ranges(NULL),
-            el_size(sizeof(T)),
-            offset(0),
-            count(0),
-            is_empty(true),
-            base(NULL)
-        {}
-
-        bool read_next(bool flag)
-        {
-            if (flag != 0) {
-                if (is_empty) {
-                    if (ranges) {
-                        if (!get_next_range(ranges, &offset)) {
-                            // ranges are over
-                            return false;
-                        }
-                    }
-                    // all contiguous elements are over
-                    else if (count != 0) {
-                        return false;
-                    }
-
-                    length_cur = size;
-                }
-                else {
-                    offset += el_size;
-                }
-                val = (T)get_el_value(base, offset, el_size);
-                length_cur -= el_size;
-                count++;
-                is_empty = length_cur == 0;
-            }
-            return true;
-        }
-    public:
-        CeanReadRanges * ranges;
-        T       val;
-        int     el_size;
-        int64_t size,
-                offset,
-                length_cur;
-        bool    is_empty;
-        int     count;
-        char   *base;
-    };
-
-    // ptr_data for persistent auto objects
-    PtrData*    m_stack_ptr_data;
-    PtrDataList m_destroy_stack;
-
-    // Engine
-    Engine& m_device;
-
-    // if true offload is mandatory
-    bool m_is_mandatory;
-
-    // if true offload has openmp origin
-    const bool m_is_openmp;
-
-    // The Marshaller for the inputs of the offloaded region.
-    Marshaller m_in;
-
-    // The Marshaller for the outputs of the offloaded region.
-    Marshaller m_out;
-
-    // List of buffers that are passed to dispatch call
-    BufferList m_compute_buffers;
-
-    // List of buffers that need to be destroyed at the end of offload
-    BufferList m_destroy_buffers;
-
-    // Variable descriptors
-    VarDesc*  m_vars;
-    VarExtra* m_vars_extra;
-    int       m_vars_total;
-
-    // Pointer to a user-specified status variable
-    _Offload_status *m_status;
-
-    // Function descriptor
-    FunctionDescriptor* m_func_desc;
-    uint32_t            m_func_desc_size;
-
-    // Buffer for transferring copyin/copyout data
-    COIBUFFER m_inout_buf;
-
-    // Dependencies
-    COIEVENT *m_in_deps;
-    uint32_t  m_in_deps_total;
-    COIEVENT *m_out_deps;
-    uint32_t  m_out_deps_total;
-
-    // Timer data
-    OffloadHostTimerData *m_timer_data;
-
-    // copyin/copyout data length
-    uint64_t m_in_datalen;
-    uint64_t m_out_datalen;
-
-    // a boolean value calculated in setup_descriptors. If true we need to do
-    // a run function on the target. Otherwise it may be optimized away.
-    bool m_need_runfunction;
-};
-
-// Initialization types for MIC
-enum OffloadInitType {
-    c_init_on_start,         // all devices before entering main
-    c_init_on_offload,       // single device before starting the first offload
-    c_init_on_offload_all    // all devices before starting the first offload
-};
-
-// Initializes library and registers specified offload image.
-extern "C" void __offload_register_image(const void* image);
-extern "C" void __offload_unregister_image(const void* image);
-
-// Initializes offload runtime library.
-extern int __offload_init_library(void);
-
-// thread data for associating pipelines with threads
-extern pthread_key_t mic_thread_key;
-
-// Environment variables for devices
-extern MicEnvVar mic_env_vars;
-
-// CPU frequency
-extern uint64_t cpu_frequency;
-
-// LD_LIBRARY_PATH for MIC libraries
-extern char* mic_library_path;
-
-// stack size for target
-extern uint32_t mic_stack_size;
-
-// Preallocated memory size for buffers on MIC
-extern uint64_t mic_buffer_size;
-
-// Setting controlling inout proxy
-extern bool  mic_proxy_io;
-extern char* mic_proxy_fs_root;
-
-// Threshold for creating buffers with large pages
-extern uint64_t __offload_use_2mb_buffers;
-
-// offload initialization type
-extern OffloadInitType __offload_init_type;
-
-// Device number to offload to when device is not explicitly specified.
-extern int __omp_device_num;
-
-// target executable
-extern TargetImage* __target_exe;
-
-// IDB support
-
-// Called by the offload runtime after initialization of offload infrastructure
-// has been completed.
-extern "C" void  __dbg_target_so_loaded();
-
-// Called by the offload runtime when the offload infrastructure is about to be
-// shut down, currently at application exit.
-extern "C" void  __dbg_target_so_unloaded();
-
-// Null-terminated string containing path to the process image of the hosting
-// application (offload_main)
-#define MAX_TARGET_NAME 512
-extern "C" char  __dbg_target_exe_name[MAX_TARGET_NAME];
-
-// Integer specifying the process id
-extern "C" pid_t __dbg_target_so_pid;
-
-// Integer specifying the 0-based device number
-extern "C" int   __dbg_target_id;
-
-// Set to non-zero by the host-side debugger to enable offload debugging
-// support
-extern "C" int   __dbg_is_attached;
-
-// Major version of the debugger support API
-extern "C" const int __dbg_api_major_version;
-
-// Minor version of the debugger support API
-extern "C" const int __dbg_api_minor_version;
-
-#endif // OFFLOAD_HOST_H_INCLUDED
diff --git a/offload/src/offload_myo_host.cpp b/offload/src/offload_myo_host.cpp
deleted file mode 100644
index 2e1c186..0000000
--- a/offload/src/offload_myo_host.cpp
+++ /dev/null
@@ -1,805 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_myo_host.h"
-#include <errno.h>
-#include <malloc.h>
-#include "offload_host.h"
-
-#if defined(LINUX) || defined(FREEBSD)
-#include <mm_malloc.h>
-#endif
-
-#define MYO_VERSION1    "MYO_1.0"
-
-extern "C" void __cilkrts_cilk_for_32(void*, void*, uint32_t, int32_t);
-extern "C" void __cilkrts_cilk_for_64(void*, void*, uint64_t, int32_t);
-
-#ifndef TARGET_WINNT
-#pragma weak __cilkrts_cilk_for_32
-#pragma weak __cilkrts_cilk_for_64
-#endif // TARGET_WINNT
-
-#ifdef TARGET_WINNT
-#define MYO_TABLE_END_MARKER() reinterpret_cast<const char*>(-1)
-#else // TARGET_WINNT
-#define MYO_TABLE_END_MARKER() reinterpret_cast<const char*>(0)
-#endif // TARGET_WINNT
-
-class MyoWrapper {
-public:
-    MyoWrapper() : m_lib_handle(0), m_is_available(false)
-    {}
-
-    bool is_available() const {
-        return m_is_available;
-    }
-
-    bool LoadLibrary(void);
-
-    // unloads the library
-    void UnloadLibrary(void) {
-//        if (m_lib_handle != 0) {
-//            DL_close(m_lib_handle);
-//            m_lib_handle = 0;
-//        }
-    }
-
-    // Wrappers for MYO client functions
-    void LibInit(void *arg, void *func) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoinit,
-                                 "%s(%p, %p)\n", __func__, arg, func);
-        CheckResult(__func__, m_lib_init(arg, func));
-    }
-
-    void LibFini(void) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myofini, "%s()\n", __func__);
-        m_lib_fini();
-    }
-
-    void* SharedMalloc(size_t size) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedmalloc,
-                                 "%s(%lld)\n", __func__, size);
-        return m_shared_malloc(size);
-    }
-
-    void SharedFree(void *ptr) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedfree,
-                                 "%s(%p)\n", __func__, ptr);
-        m_shared_free(ptr);
-    }
-
-    void* SharedAlignedMalloc(size_t size, size_t align) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedmalloc,
-                                 "%s(%lld, %lld)\n", __func__, size, align);
-        return m_shared_aligned_malloc(size, align);
-    }
-
-    void SharedAlignedFree(void *ptr) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedfree,
-                              "%s(%p)\n", __func__, ptr);
-        m_shared_aligned_free(ptr);
-    }
-
-    void Acquire(void) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoacquire,
-                              "%s()\n", __func__);
-        CheckResult(__func__, m_acquire());
-    }
-
-    void Release(void) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myorelease,
-                            "%s()\n", __func__);
-        CheckResult(__func__, m_release());
-    }
-
-    void HostVarTablePropagate(void *table, int num_entries) const {
-        OFFLOAD_DEBUG_TRACE(4, "%s(%p, %d)\n", __func__, table, num_entries);
-        CheckResult(__func__, m_host_var_table_propagate(table, num_entries));
-    }
-
-    void HostFptrTableRegister(void *table, int num_entries,
-                               int ordered) const {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoregister,
-                            "%s(%p, %d, %d)\n", __func__, table,
-                            num_entries, ordered);
-        CheckResult(__func__,
-                    m_host_fptr_table_register(table, num_entries, ordered));
-    }
-
-    void RemoteThunkCall(void *thunk, void *args, int device) {
-        OFFLOAD_DEBUG_TRACE(4, "%s(%p, %p, %d)\n", __func__, thunk, args,
-                            device);
-        CheckResult(__func__, m_remote_thunk_call(thunk, args, device));
-    }
-
-    MyoiRFuncCallHandle RemoteCall(char *func, void *args, int device) const {
-        OFFLOAD_DEBUG_TRACE(4, "%s(%s, %p, %d)\n", __func__, func, args,
-                            device);
-        return m_remote_call(func, args, device);
-    }
-
-    void GetResult(MyoiRFuncCallHandle handle) const {
-        OFFLOAD_DEBUG_TRACE(4, "%s(%p)\n", __func__, handle);
-        CheckResult(__func__, m_get_result(handle));
-    }
-
-private:
-    void CheckResult(const char *func, MyoError error) const {
-        if (error != MYO_SUCCESS) {
-             LIBOFFLOAD_ERROR(c_myowrapper_checkresult, func, error);
-            exit(1);
-        }
-    }
-
-private:
-    void* m_lib_handle;
-    bool  m_is_available;
-
-    // pointers to functions from myo library
-    MyoError (*m_lib_init)(void*, void*);
-    void     (*m_lib_fini)(void);
-    void*    (*m_shared_malloc)(size_t);
-    void     (*m_shared_free)(void*);
-    void*    (*m_shared_aligned_malloc)(size_t, size_t);
-    void     (*m_shared_aligned_free)(void*);
-    MyoError (*m_acquire)(void);
-    MyoError (*m_release)(void);
-    MyoError (*m_host_var_table_propagate)(void*, int);
-    MyoError (*m_host_fptr_table_register)(void*, int, int);
-    MyoError (*m_remote_thunk_call)(void*, void*, int);
-    MyoiRFuncCallHandle (*m_remote_call)(char*, void*, int);
-    MyoError (*m_get_result)(MyoiRFuncCallHandle);
-};
-
-bool MyoWrapper::LoadLibrary(void)
-{
-#ifndef TARGET_WINNT
-    const char *lib_name = "libmyo-client.so";
-#else // TARGET_WINNT
-    const char *lib_name = "myo-client.dll";
-#endif // TARGET_WINNT
-
-    OFFLOAD_DEBUG_TRACE(2, "Loading MYO library %s ...\n", lib_name);
-
-    m_lib_handle = DL_open(lib_name);
-    if (m_lib_handle == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to load the library. errno = %d\n",
-                            errno);
-        return false;
-    }
-
-    m_lib_init = (MyoError (*)(void*, void*))
-        DL_sym(m_lib_handle, "myoiLibInit", MYO_VERSION1);
-    if (m_lib_init == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiLibInit");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_lib_fini = (void (*)(void))
-        DL_sym(m_lib_handle, "myoiLibFini", MYO_VERSION1);
-    if (m_lib_fini == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiLibFini");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_shared_malloc = (void* (*)(size_t))
-        DL_sym(m_lib_handle, "myoSharedMalloc", MYO_VERSION1);
-    if (m_shared_malloc == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoSharedMalloc");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_shared_free = (void (*)(void*))
-        DL_sym(m_lib_handle, "myoSharedFree", MYO_VERSION1);
-    if (m_shared_free == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoSharedFree");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_shared_aligned_malloc = (void* (*)(size_t, size_t))
-        DL_sym(m_lib_handle, "myoSharedAlignedMalloc", MYO_VERSION1);
-    if (m_shared_aligned_malloc == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoSharedAlignedMalloc");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_shared_aligned_free = (void (*)(void*))
-        DL_sym(m_lib_handle, "myoSharedAlignedFree", MYO_VERSION1);
-    if (m_shared_aligned_free == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoSharedAlignedFree");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_acquire = (MyoError (*)(void))
-        DL_sym(m_lib_handle, "myoAcquire", MYO_VERSION1);
-    if (m_acquire == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoAcquire");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_release = (MyoError (*)(void))
-        DL_sym(m_lib_handle, "myoRelease", MYO_VERSION1);
-    if (m_release == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoRelease");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_host_var_table_propagate = (MyoError (*)(void*, int))
-        DL_sym(m_lib_handle, "myoiHostVarTablePropagate", MYO_VERSION1);
-    if (m_host_var_table_propagate == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiHostVarTablePropagate");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_host_fptr_table_register = (MyoError (*)(void*, int, int))
-        DL_sym(m_lib_handle, "myoiHostFptrTableRegister", MYO_VERSION1);
-    if (m_host_fptr_table_register == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiHostFptrTableRegister");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_remote_thunk_call = (MyoError (*)(void*, void*, int))
-        DL_sym(m_lib_handle, "myoiRemoteThunkCall", MYO_VERSION1);
-    if (m_remote_thunk_call == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiRemoteThunkCall");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_remote_call = (MyoiRFuncCallHandle (*)(char*, void*, int))
-        DL_sym(m_lib_handle, "myoiRemoteCall", MYO_VERSION1);
-    if (m_remote_call == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiRemoteCall");
-        UnloadLibrary();
-        return false;
-    }
-
-    m_get_result = (MyoError (*)(MyoiRFuncCallHandle))
-        DL_sym(m_lib_handle, "myoiGetResult", MYO_VERSION1);
-    if (m_get_result == 0) {
-        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
-                            "myoiGetResult");
-        UnloadLibrary();
-        return false;
-    }
-
-    OFFLOAD_DEBUG_TRACE(2, "The library was successfully loaded\n");
-
-    m_is_available = true;
-
-    return true;
-}
-
-static bool myo_is_available;
-static MyoWrapper myo_wrapper;
-
-struct MyoTable
-{
-    MyoTable(SharedTableEntry *tab, int len) : var_tab(tab), var_tab_len(len)
-    {}
-
-    SharedTableEntry*   var_tab;
-    int                 var_tab_len;
-};
-
-typedef std::list<MyoTable> MyoTableList;
-static MyoTableList __myo_table_list;
-static mutex_t      __myo_table_lock;
-static bool         __myo_tables = false;
-
-static void __offload_myo_shared_table_register(SharedTableEntry *entry);
-static void __offload_myo_shared_init_table_register(InitTableEntry* entry);
-static void __offload_myo_fptr_table_register(FptrTableEntry *entry);
-
-static void __offload_myoLoadLibrary_once(void)
-{
-    if (__offload_init_library()) {
-        myo_wrapper.LoadLibrary();
-    }
-}
-
-static bool __offload_myoLoadLibrary(void)
-{
-    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
-    __offload_run_once(&ctrl, __offload_myoLoadLibrary_once);
-
-    return myo_wrapper.is_available();
-}
-
-static void __offload_myoInit_once(void)
-{
-    if (!__offload_myoLoadLibrary()) {
-        return;
-    }
-
-    // initialize all devices
-    for (int i = 0; i < mic_engines_total; i++) {
-        mic_engines[i].init();
-    }
-
-    // load and initialize MYO library
-    OFFLOAD_DEBUG_TRACE(2, "Initializing MYO library ...\n");
-
-    COIEVENT events[MIC_ENGINES_MAX];
-    MyoiUserParams params[MIC_ENGINES_MAX+1];
-
-    // load target library to all devices
-    for (int i = 0; i < mic_engines_total; i++) {
-        mic_engines[i].init_myo(&events[i]);
-
-        params[i].type = MYOI_USERPARAMS_DEVID;
-        params[i].nodeid = mic_engines[i].get_physical_index() + 1;
-    }
-
-    params[mic_engines_total].type = MYOI_USERPARAMS_LAST_MSG;
-
-    // initialize myo runtime on host
-    myo_wrapper.LibInit(params, 0);
-
-    // wait for the target init calls to finish
-    COIRESULT res;
-    res = COI::EventWait(mic_engines_total, events, -1, 1, 0, 0);
-    if (res != COI_SUCCESS) {
-        LIBOFFLOAD_ERROR(c_event_wait, res);
-        exit(1);
-    }
-
-    myo_is_available = true;
-
-    OFFLOAD_DEBUG_TRACE(2, "Initializing MYO library ... done\n");
-}
-
-static bool __offload_myoInit(void)
-{
-    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
-    __offload_run_once(&ctrl, __offload_myoInit_once);
-
-    // register pending shared var tables
-    if (myo_is_available && __myo_tables) {
-        mutex_locker_t locker(__myo_table_lock);
-
-        if (__myo_tables) {
-            //  Register tables with MYO so it can propagate to target.
-            for(MyoTableList::const_iterator it = __myo_table_list.begin();
-                it != __myo_table_list.end(); ++it) {
-#ifdef TARGET_WINNT
-                for (SharedTableEntry *entry = it->var_tab;
-                     entry->varName != MYO_TABLE_END_MARKER(); entry++) {
-                    if (entry->varName == 0) {
-                        continue;
-                    }
-                    myo_wrapper.HostVarTablePropagate(entry, 1);
-                }
-#else // TARGET_WINNT
-                myo_wrapper.HostVarTablePropagate(it->var_tab,
-                                                  it->var_tab_len);
-#endif // TARGET_WINNT
-            }
-
-            __myo_table_list.clear();
-            __myo_tables = false;
-        }
-    }
-
-    return myo_is_available;
-}
-
-static bool shared_table_entries(
-    SharedTableEntry *entry
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    for (; entry->varName != MYO_TABLE_END_MARKER(); entry++) {
-#ifdef TARGET_WINNT
-        if (entry->varName == 0) {
-            continue;
-        }
-#endif // TARGET_WINNT
-
-        return true;
-    }
-
-    return false;
-}
-
-static bool fptr_table_entries(
-    FptrTableEntry *entry
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
-#ifdef TARGET_WINNT
-        if (entry->funcName == 0) {
-            continue;
-        }
-#endif // TARGET_WINNT
-
-        return true;
-    }
-
-    return false;
-}
-
-extern "C" void __offload_myoRegisterTables(
-    InitTableEntry* init_table,
-    SharedTableEntry *shared_table,
-    FptrTableEntry *fptr_table
-)
-{
-    // check whether we need to initialize MYO library. It is
-    // initialized only if at least one myo table is not empty
-    if (shared_table_entries(shared_table) || fptr_table_entries(fptr_table)) {
-        // make sure myo library is loaded
-        __offload_myoLoadLibrary();
-
-        // register tables
-        __offload_myo_shared_table_register(shared_table);
-        __offload_myo_fptr_table_register(fptr_table);
-        __offload_myo_shared_init_table_register(init_table);
-    }
-}
-
-void __offload_myoFini(void)
-{
-    if (myo_is_available) {
-        OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-        COIEVENT events[MIC_ENGINES_MAX];
-
-        // kick off myoiLibFini calls on all devices
-        for (int i = 0; i < mic_engines_total; i++) {
-            mic_engines[i].fini_myo(&events[i]);
-        }
-
-        // cleanup myo runtime on host
-        myo_wrapper.LibFini();
-
-        // wait for the target fini calls to finish
-        COIRESULT res;
-        res = COI::EventWait(mic_engines_total, events, -1, 1, 0, 0);
-        if (res != COI_SUCCESS) {
-            LIBOFFLOAD_ERROR(c_event_wait, res);
-            exit(1);
-        }
-    }
-}
-
-static void __offload_myo_shared_table_register(
-    SharedTableEntry *entry
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    SharedTableEntry *start = entry;
-    int entries = 0;
-
-    // allocate shared memory for vars
-    for (; entry->varName != MYO_TABLE_END_MARKER(); entry++) {
-#ifdef TARGET_WINNT
-        if (entry->varName == 0) {
-            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoSharedTable entry\n");
-            continue;
-        }
-#endif // TARGET_WINNT
-
-        OFFLOAD_DEBUG_TRACE(4, "registering MyoSharedTable entry for %s @%p\n",
-                            entry->varName, entry);
-
-        // Invoke the function to create shared memory
-        reinterpret_cast<void(*)(void)>(entry->sharedAddr)();
-        entries++;
-    }
-
-    // and table to the list if it is not empty
-    if (entries > 0) {
-        mutex_locker_t locker(__myo_table_lock);
-        __myo_table_list.push_back(MyoTable(start, entries));
-        __myo_tables = true;
-    }
-}
-
-static void __offload_myo_shared_init_table_register(InitTableEntry* entry)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-#ifdef TARGET_WINNT
-    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
-        if (entry->funcName == 0) {
-            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoSharedInit entry\n");
-            continue;
-        }
-
-        //  Invoke the function to init the shared memory
-        entry->func();
-    }
-#else // TARGET_WINNT
-    for (; entry->func != 0; entry++) {
-        // Invoke the function to init the shared memory
-        entry->func();
-    }
-#endif // TARGET_WINNT
-}
-
-static void __offload_myo_fptr_table_register(
-    FptrTableEntry *entry
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    FptrTableEntry *start = entry;
-    int entries = 0;
-
-    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
-#ifdef TARGET_WINNT
-        if (entry->funcName == 0) {
-            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoFptrTable entry\n");
-            continue;
-        }
-#endif // TARGET_WINNT
-
-        if (!myo_wrapper.is_available()) {
-            *(static_cast<void**>(entry->localThunkAddr)) = entry->funcAddr;
-        }
-
-        OFFLOAD_DEBUG_TRACE(4, "registering MyoFptrTable entry for %s @%p\n",
-                            entry->funcName, entry);
-
-#ifdef TARGET_WINNT
-        if (myo_wrapper.is_available()) {
-            myo_wrapper.HostFptrTableRegister(entry, 1, false);
-        }
-#endif // TARGET_WINNT
-
-        entries++;
-    }
-
-#ifndef TARGET_WINNT
-    if (myo_wrapper.is_available() && entries > 0) {
-        myo_wrapper.HostFptrTableRegister(start, entries, false);
-    }
-#endif // TARGET_WINNT
-}
-
-extern "C" int __offload_myoIsAvailable(int target_number)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%d)\n", __func__, target_number);
-
-    if (target_number >= -2) {
-        bool is_default_number = (target_number == -2);
-
-        if (__offload_myoInit()) {
-            if (target_number >= 0) {
-                // User provided the device number
-                int num = target_number % mic_engines_total;
-
-                // reserve device in ORSL
-                target_number = ORSL::reserve(num) ? num : -1;
-            }
-            else {
-                // try to use device 0
-                target_number = ORSL::reserve(0) ? 0 : -1;
-            }
-
-            // make sure device is initialized
-            if (target_number >= 0) {
-                mic_engines[target_number].init();
-            }
-        }
-        else {
-            // fallback to CPU
-            target_number = -1;
-        }
-
-        if (target_number < 0 && !is_default_number) {
-            LIBOFFLOAD_ERROR(c_device_is_not_available);
-            exit(1);
-        }
-    }
-    else {
-        LIBOFFLOAD_ERROR(c_invalid_device_number);
-        exit(1);
-    }
-
-    return target_number;
-}
-
-extern "C" void __offload_myoiRemoteIThunkCall(
-    void *thunk,
-    void *arg,
-    int target_number
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p, %p, %d)\n", __func__, thunk, arg,
-                        target_number);
-
-    myo_wrapper.Release();
-    myo_wrapper.RemoteThunkCall(thunk, arg, target_number);
-    myo_wrapper.Acquire();
-
-    ORSL::release(target_number);
-}
-
-extern "C" void* _Offload_shared_malloc(size_t size)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%lld)\n", __func__, size);
-
-    if (__offload_myoLoadLibrary()) {
-        return myo_wrapper.SharedMalloc(size);
-    }
-    else {
-        return malloc(size);
-    }
-}
-
-extern "C" void _Offload_shared_free(void *ptr)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
-
-    if (__offload_myoLoadLibrary()) {
-        myo_wrapper.SharedFree(ptr);
-    }
-    else {
-        free(ptr);
-    }
-}
-
-extern "C" void* _Offload_shared_aligned_malloc(size_t size, size_t align)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%lld, %lld)\n", __func__, size, align);
-
-    if (__offload_myoLoadLibrary()) {
-        return myo_wrapper.SharedAlignedMalloc(size, align);
-    }
-    else {
-        if (align < sizeof(void*)) {
-            align = sizeof(void*);
-        }
-        return _mm_malloc(size, align);
-    }
-}
-
-extern "C" void _Offload_shared_aligned_free(void *ptr)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
-
-    if (__offload_myoLoadLibrary()) {
-        myo_wrapper.SharedAlignedFree(ptr);
-    }
-    else {
-        _mm_free(ptr);
-    }
-}
-
-extern "C" void __intel_cilk_for_32_offload(
-    int size,
-    void (*copy_constructor)(void*, void*),
-    int target_number,
-    void *raddr,
-    void *closure_object,
-    unsigned int iters,
-    unsigned int grain_size)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-    target_number = __offload_myoIsAvailable(target_number);
-    if (target_number >= 0) {
-        struct S {
-            void *M1;
-            unsigned int M2;
-            unsigned int M3;
-            char closure[];
-        } *args;
-
-        args = (struct S*) _Offload_shared_malloc(sizeof(struct S) + size);
-        args->M1 = raddr;
-        args->M2 = iters;
-        args->M3 = grain_size;
-
-        if (copy_constructor == 0) {
-            memcpy(args->closure, closure_object, size);
-        }
-        else {
-            copy_constructor(args->closure, closure_object);
-        }
-
-        myo_wrapper.Release();
-        myo_wrapper.GetResult(
-            myo_wrapper.RemoteCall("__intel_cilk_for_32_offload",
-                                   args, target_number)
-        );
-        myo_wrapper.Acquire();
-
-        _Offload_shared_free(args);
-
-        ORSL::release(target_number);
-    }
-    else {
-        __cilkrts_cilk_for_32(raddr,
-                              closure_object,
-                              iters,
-                              grain_size);
-    }
-}
-
-extern "C" void __intel_cilk_for_64_offload(
-    int size,
-    void (*copy_constructor)(void*, void*),
-    int target_number,
-    void *raddr,
-    void *closure_object,
-    uint64_t iters,
-    uint64_t grain_size)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-    target_number = __offload_myoIsAvailable(target_number);
-    if (target_number >= 0) {
-        struct S {
-            void *M1;
-            uint64_t M2;
-            uint64_t M3;
-            char closure[];
-        } *args;
-
-        args = (struct S*) _Offload_shared_malloc(sizeof(struct S) + size);
-        args->M1 = raddr;
-        args->M2 = iters;
-        args->M3 = grain_size;
-
-        if (copy_constructor == 0) {
-            memcpy(args->closure, closure_object, size);
-        }
-        else {
-            copy_constructor(args->closure, closure_object);
-        }
-
-        myo_wrapper.Release();
-        myo_wrapper.GetResult(
-            myo_wrapper.RemoteCall("__intel_cilk_for_64_offload", args,
-                                   target_number)
-        );
-        myo_wrapper.Acquire();
-
-        _Offload_shared_free(args);
-
-        ORSL::release(target_number);
-    }
-    else {
-        __cilkrts_cilk_for_64(raddr,
-                              closure_object,
-                              iters,
-                              grain_size);
-    }
-}
diff --git a/offload/src/offload_myo_host.h b/offload/src/offload_myo_host.h
deleted file mode 100644
index 92a61f4..0000000
--- a/offload/src/offload_myo_host.h
+++ /dev/null
@@ -1,80 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_MYO_HOST_H_INCLUDED
-#define OFFLOAD_MYO_HOST_H_INCLUDED
-
-#include <myotypes.h>
-#include <myoimpl.h>
-#include <myo.h>
-#include "offload.h"
-
-typedef MyoiSharedVarEntry      SharedTableEntry;
-//typedef MyoiHostSharedFptrEntry FptrTableEntry;
-typedef struct {
-    //! Function Name
-    const char *funcName;
-    //! Function Address
-    void *funcAddr;
-    //! Local Thunk Address
-    void *localThunkAddr;
-#ifdef TARGET_WINNT
-    // Dummy to pad up to 32 bytes
-    void *dummy;
-#endif // TARGET_WINNT
-} FptrTableEntry;
-
-struct InitTableEntry {
-#ifdef TARGET_WINNT
-    // Dummy to pad up to 16 bytes
-    // Function Name
-    const char *funcName;
-#endif // TARGET_WINNT
-    void (*func)(void);
-};
-
-#ifdef TARGET_WINNT
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable$a"
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable$z"
-
-#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START     ".MyoSharedInitTable$a"
-#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END       ".MyoSharedInitTable$z"
-
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable$a"
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable$z"
-#else  // TARGET_WINNT
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable."
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable."
-
-#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START     ".MyoSharedInitTable."
-#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END       ".MyoSharedInitTable."
-
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable."
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable."
-#endif // TARGET_WINNT
-
-#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_END, read, write)
-
-#pragma section(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END, read, write)
-
-#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_END, read, write)
-
-extern "C" void __offload_myoRegisterTables(
-    InitTableEntry *init_table,
-    SharedTableEntry *shared_table,
-    FptrTableEntry *fptr_table
-);
-
-extern void __offload_myoFini(void);
-
-#endif // OFFLOAD_MYO_HOST_H_INCLUDED
diff --git a/offload/src/offload_myo_target.cpp b/offload/src/offload_myo_target.cpp
deleted file mode 100644
index eeb1c4f..0000000
--- a/offload/src/offload_myo_target.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_myo_target.h"
-#include "offload_target.h"
-
-extern "C" void __cilkrts_cilk_for_32(void*, void*, uint32_t, int32_t);
-extern "C" void __cilkrts_cilk_for_64(void*, void*, uint64_t, int32_t);
-
-#pragma weak __cilkrts_cilk_for_32
-#pragma weak __cilkrts_cilk_for_64
-
-static void CheckResult(const char *func, MyoError error) {
-    if (error != MYO_SUCCESS) {
-       LIBOFFLOAD_ERROR(c_myotarget_checkresult, func, error);
-        exit(1);
-    }
-}
-
-static void __offload_myo_shared_table_register(SharedTableEntry *entry)
-{
-    int entries = 0;
-    SharedTableEntry *t_start;
-
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    t_start = entry;
-    while (t_start->varName != 0) {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_mic_myo_shared,
-                              "myo shared entry name = \"%s\" addr = %p\n",
-                              t_start->varName, t_start->sharedAddr);
-        t_start++;
-        entries++;
-    }
-
-    if (entries > 0) {
-        OFFLOAD_DEBUG_TRACE(3, "myoiMicVarTableRegister(%p, %d)\n", entry,
-                            entries);
-        CheckResult("myoiMicVarTableRegister",
-                    myoiMicVarTableRegister(entry, entries));
-    }
-}
-
-static void __offload_myo_fptr_table_register(
-    FptrTableEntry *entry
-)
-{
-    int entries = 0;
-    FptrTableEntry *t_start;
-
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
-
-    t_start = entry;
-    while (t_start->funcName != 0) {
-        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_mic_myo_fptr,
-                              "myo fptr entry name = \"%s\" addr = %p\n",
-                              t_start->funcName, t_start->funcAddr);
-        t_start++;
-        entries++;
-    }
-
-    if (entries > 0) {
-        OFFLOAD_DEBUG_TRACE(3, "myoiTargetFptrTableRegister(%p, %d, 0)\n",
-                            entry, entries);
-        CheckResult("myoiTargetFptrTableRegister",
-                    myoiTargetFptrTableRegister(entry, entries, 0));
-    }
-}
-
-extern "C" void __offload_myoAcquire(void)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-    CheckResult("myoAcquire", myoAcquire());
-}
-
-extern "C" void __offload_myoRelease(void)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-    CheckResult("myoRelease", myoRelease());
-}
-
-extern "C" void __intel_cilk_for_32_offload_wrapper(void *args_)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-    struct S {
-        void *M1;
-        unsigned int M2;
-        unsigned int M3;
-        char closure[];
-    } *args = (struct S*) args_;
-
-    __cilkrts_cilk_for_32(args->M1, args->closure, args->M2, args->M3);
-}
-
-extern "C" void __intel_cilk_for_64_offload_wrapper(void *args_)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-    struct S {
-        void *M1;
-        uint64_t M2;
-        uint64_t M3;
-        char closure[];
-    } *args = (struct S*) args_;
-
-    __cilkrts_cilk_for_64(args->M1, args->closure, args->M2, args->M3);
-}
-
-static void __offload_myo_once_init(void)
-{
-    CheckResult("myoiRemoteFuncRegister",
-                myoiRemoteFuncRegister(
-                    (MyoiRemoteFuncType) __intel_cilk_for_32_offload_wrapper,
-                    "__intel_cilk_for_32_offload"));
-    CheckResult("myoiRemoteFuncRegister",
-                myoiRemoteFuncRegister(
-                    (MyoiRemoteFuncType) __intel_cilk_for_64_offload_wrapper,
-                    "__intel_cilk_for_64_offload"));
-}
-
-extern "C" void __offload_myoRegisterTables(
-    SharedTableEntry *shared_table,
-    FptrTableEntry *fptr_table
-)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
-
-    // one time registration of Intel(R) Cilk(TM) language entries
-    static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-    pthread_once(&once_control, __offload_myo_once_init);
-
-    // register module's tables
-    if (shared_table->varName == 0 && fptr_table->funcName == 0) {
-        return;
-    }
-
-    __offload_myo_shared_table_register(shared_table);
-    __offload_myo_fptr_table_register(fptr_table);
-}
-
-extern "C" void* _Offload_shared_malloc(size_t size)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%lld)\n", __func__, size);
-    return myoSharedMalloc(size);
-}
-
-extern "C" void _Offload_shared_free(void *ptr)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
-    myoSharedFree(ptr);
-}
-
-extern "C" void* _Offload_shared_aligned_malloc(size_t size, size_t align)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%lld, %lld)\n", __func__, size, align);
-    return myoSharedAlignedMalloc(size, align);
-}
-
-extern "C" void _Offload_shared_aligned_free(void *ptr)
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
-    myoSharedAlignedFree(ptr);
-}
-
-// temporary workaround for blocking behavior of myoiLibInit/Fini calls
-extern "C" void __offload_myoLibInit()
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s()\n", __func__);
-    CheckResult("myoiLibInit", myoiLibInit(0, 0));
-}
-
-extern "C" void __offload_myoLibFini()
-{
-    OFFLOAD_DEBUG_TRACE(3, "%s()\n", __func__);
-    myoiLibFini();
-}
diff --git a/offload/src/offload_myo_target.h b/offload/src/offload_myo_target.h
deleted file mode 100644
index 8b7f789..0000000
--- a/offload/src/offload_myo_target.h
+++ /dev/null
@@ -1,54 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_MYO_TARGET_H_INCLUDED
-#define OFFLOAD_MYO_TARGET_H_INCLUDED
-
-#include <myotypes.h>
-#include <myoimpl.h>
-#include <myo.h>
-#include "offload.h"
-
-typedef MyoiSharedVarEntry          SharedTableEntry;
-typedef MyoiTargetSharedFptrEntry   FptrTableEntry;
-
-#ifdef TARGET_WINNT
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable$a"
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable$z"
-
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable$a"
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable$z"
-#else  // TARGET_WINNT
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable."
-#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable."
-
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable."
-#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable."
-#endif // TARGET_WINNT
-
-#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_END, read, write)
-
-#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_END, read, write)
-
-extern "C" void __offload_myoRegisterTables(
-    SharedTableEntry *shared_table,
-    FptrTableEntry *fptr_table
-);
-
-extern "C" void __offload_myoAcquire(void);
-extern "C" void __offload_myoRelease(void);
-
-// temporary workaround for blocking behavior for myoiLibInit/Fini calls
-extern "C" void __offload_myoLibInit();
-extern "C" void __offload_myoLibFini();
-
-#endif // OFFLOAD_MYO_TARGET_H_INCLUDED
diff --git a/offload/src/offload_omp_host.cpp b/offload/src/offload_omp_host.cpp
deleted file mode 100644
index edd4445..0000000
--- a/offload/src/offload_omp_host.cpp
+++ /dev/null
@@ -1,851 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include <omp.h>
-#include "offload.h"
-#include "compiler_if_host.h"
-
-// OpenMP API
-
-void omp_set_default_device(int num)
-{
-    if (num >= 0) {
-        __omp_device_num = num;
-    }
-}
-
-int omp_get_default_device(void)
-{
-    return __omp_device_num;
-}
-
-int omp_get_num_devices()
-{
-    __offload_init_library();
-    return mic_engines_total;
-}
-
-// OpenMP API wrappers
-
-static void omp_set_int_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int setting,
-    const char* f_name
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          f_name, 0);
-    if (ofld) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(int);
-        vars[0].count = 1;
-        vars[0].ptr = &setting;
-
-        OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-static int omp_get_int_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    const char * f_name
-)
-{
-    int setting = 0;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          f_name, 0);
-    if (ofld) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_out;
-        vars[0].size = sizeof(int);
-        vars[0].count = 1;
-        vars[0].ptr = &setting;
-
-        OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
-    }
-    return setting;
-}
-
-void omp_set_num_threads_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-)
-{
-    omp_set_int_target(target_type, target_number, num_threads,
-                       "omp_set_num_threads_target");
-}
-
-int omp_get_max_threads_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "omp_get_max_threads_target");
-}
-
-int omp_get_num_procs_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "omp_get_num_procs_target");
-}
-
-void omp_set_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-)
-{
-    omp_set_int_target(target_type, target_number, num_threads,
-                       "omp_set_dynamic_target");
-}
-
-int omp_get_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "omp_get_dynamic_target");
-}
-
-void omp_set_nested_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int nested
-)
-{
-    omp_set_int_target(target_type, target_number, nested,
-                       "omp_set_nested_target");
-}
-
-int omp_get_nested_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "omp_get_nested_target");
-}
-
-void omp_set_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t kind,
-    int modifier
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(omp_sched_t);
-        vars[0].count = 1;
-        vars[0].ptr = &kind;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_in;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = &modifier;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_set_schedule_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_get_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t *kind,
-    int *modifier
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_out;
-        vars[0].size = sizeof(omp_sched_t);
-        vars[0].count = 1;
-        vars[0].ptr = kind;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_out;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = modifier;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_get_schedule_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-}
-
-// lock API functions
-
-void omp_init_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_out;
-        vars[0].size = sizeof(omp_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_init_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_destroy_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(omp_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_destroy_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_set_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_set_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_unset_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_unset_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-int omp_test_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    int result = 0;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_out;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_test_lock_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-// nested lock API functions
-
-void omp_init_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_out;
-        vars[0].size = sizeof(omp_nest_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_init_nest_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_destroy_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(omp_nest_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_destroy_nest_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_set_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_nest_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_set_nest_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void omp_unset_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_nest_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_unset_nest_lock_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-int omp_test_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    int result = 0;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(omp_nest_lock_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = lock;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_out;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "omp_test_nest_lock_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-// kmp API functions
-
-void kmp_set_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int size
-)
-{
-    omp_set_int_target(target_type, target_number, size,
-                       "kmp_set_stacksize_target");
-}
-
-int kmp_get_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "kmp_get_stacksize_target");
-}
-
-void kmp_set_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    size_t size
-)
-{
-    omp_set_int_target(target_type, target_number, size,
-                       "kmp_set_stacksize_s_target");
-}
-
-size_t kmp_get_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "kmp_get_stacksize_s_target");
-}
-
-void kmp_set_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int time
-)
-{
-    omp_set_int_target(target_type, target_number, time,
-                       "kmp_set_blocktime_target");
-}
-
-int kmp_get_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "kmp_get_blocktime_target");
-}
-
-void kmp_set_library_serial_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_serial_target",
-                        0, 0, 0, 0, 0, 0, 0);
-    }
-}
-
-void kmp_set_library_turnaround_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_turnaround_target",
-                        0, 0, 0, 0, 0, 0, 0);
-    }
-}
-
-void kmp_set_library_throughput_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_throughput_target",
-                        0, 0, 0, 0, 0, 0, 0);
-    }
-}
-
-void kmp_set_library_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int mode
-)
-{
-    omp_set_int_target(target_type, target_number, mode,
-                       "kmp_set_library_target");
-}
-
-int kmp_get_library_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "kmp_get_library_target");
-}
-
-void kmp_set_defaults_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    char const *defaults
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_string_ptr;
-        vars[0].type.dst = c_string_ptr;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].alloc_if = 1;
-        vars[0].free_if = 1;
-        vars[0].ptr = &defaults;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_defaults_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-// affinity API functions
-
-void kmp_create_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_out;
-        vars[0].size = sizeof(kmp_affinity_mask_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = mask;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_create_affinity_mask_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-void kmp_destroy_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[1] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(kmp_affinity_mask_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = mask;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_destroy_affinity_mask_target",
-                        0, 1, vars, NULL, 0, 0, 0);
-    }
-}
-
-int kmp_set_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    int result = 1;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(kmp_affinity_mask_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = mask;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_out;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_affinity_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-int kmp_get_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    int result = 1;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[2] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_inout;
-        vars[0].size = sizeof(kmp_affinity_mask_target_t);
-        vars[0].count = 1;
-        vars[0].ptr = mask;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_out;
-        vars[1].size = sizeof(int);
-        vars[1].count = 1;
-        vars[1].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_get_affinity_target",
-                        0, 2, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-int kmp_get_affinity_max_proc_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return omp_get_int_target(target_type, target_number,
-                              "kmp_get_affinity_max_proc_target");
-}
-
-int kmp_set_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    int result = 1;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[3] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(int);
-        vars[0].count = 1;
-        vars[0].ptr = &proc;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_inout;
-        vars[1].size = sizeof(kmp_affinity_mask_target_t);
-        vars[1].count = 1;
-        vars[1].ptr = mask;
-
-        vars[2].type.src = c_data;
-        vars[2].type.dst = c_data;
-        vars[2].direction.bits = c_parameter_out;
-        vars[2].size = sizeof(int);
-        vars[2].count = 1;
-        vars[2].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_set_affinity_mask_proc_target",
-                        0, 3, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-int kmp_unset_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    int result = 1;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[3] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(int);
-        vars[0].count = 1;
-        vars[0].ptr = &proc;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_inout;
-        vars[1].size = sizeof(kmp_affinity_mask_target_t);
-        vars[1].count = 1;
-        vars[1].ptr = mask;
-
-        vars[2].type.src = c_data;
-        vars[2].type.dst = c_data;
-        vars[2].direction.bits = c_parameter_out;
-        vars[2].size = sizeof(int);
-        vars[2].count = 1;
-        vars[2].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_unset_affinity_mask_proc_target",
-                        0, 3, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
-
-int kmp_get_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    int result = 1;
-
-    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
-                                          __func__, 0);
-    if (ofld != 0) {
-        VarDesc vars[3] = {0};
-
-        vars[0].type.src = c_data;
-        vars[0].type.dst = c_data;
-        vars[0].direction.bits = c_parameter_in;
-        vars[0].size = sizeof(int);
-        vars[0].count = 1;
-        vars[0].ptr = &proc;
-
-        vars[1].type.src = c_data;
-        vars[1].type.dst = c_data;
-        vars[1].direction.bits = c_parameter_in;
-        vars[1].size = sizeof(kmp_affinity_mask_target_t);
-        vars[1].count = 1;
-        vars[1].ptr = mask;
-
-        vars[2].type.src = c_data;
-        vars[2].type.dst = c_data;
-        vars[2].direction.bits = c_parameter_out;
-        vars[2].size = sizeof(int);
-        vars[2].count = 1;
-        vars[2].ptr = &result;
-
-        OFFLOAD_OFFLOAD(ofld, "kmp_get_affinity_mask_proc_target",
-                        0, 3, vars, NULL, 0, 0, 0);
-    }
-    return result;
-}
diff --git a/offload/src/offload_omp_target.cpp b/offload/src/offload_omp_target.cpp
deleted file mode 100644
index 1f2052a..0000000
--- a/offload/src/offload_omp_target.cpp
+++ /dev/null
@@ -1,1021 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include <omp.h>
-#include "offload.h"
-#include "compiler_if_target.h"
-
-// OpenMP API
-
-void omp_set_default_device(int num)
-{
-}
-
-int omp_get_default_device(void)
-{
-    return mic_index;
-}
-
-int omp_get_num_devices()
-{
-    return mic_engines_total;
-}
-
-// OpenMP API wrappers
-
-static void omp_send_int_to_host(
-    void *ofld_,
-    int setting
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_out;
-    vars[0].ptr = &setting;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-static int omp_get_int_from_host(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    int setting;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &setting;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    OFFLOAD_TARGET_LEAVE(ofld);
-
-    return setting;
-}
-
-void omp_set_num_threads_lrb(
-    void *ofld
-)
-{
-    int num_threads;
-
-    num_threads = omp_get_int_from_host(ofld);
-    omp_set_num_threads(num_threads);
-}
-
-void omp_get_max_threads_lrb(
-    void *ofld
-)
-{
-    int num_threads;
-
-    num_threads = omp_get_max_threads();
-    omp_send_int_to_host(ofld, num_threads);
-}
-
-void omp_get_num_procs_lrb(
-    void *ofld
-)
-{
-    int num_procs;
-
-    num_procs = omp_get_num_procs();
-    omp_send_int_to_host(ofld, num_procs);
-}
-
-void omp_set_dynamic_lrb(
-    void *ofld
-)
-{
-    int dynamic;
-
-    dynamic = omp_get_int_from_host(ofld);
-    omp_set_dynamic(dynamic);
-}
-
-void omp_get_dynamic_lrb(
-    void *ofld
-)
-{
-    int dynamic;
-
-    dynamic = omp_get_dynamic();
-    omp_send_int_to_host(ofld, dynamic);
-}
-
-void omp_set_nested_lrb(
-    void *ofld
-)
-{
-    int nested;
-
-    nested = omp_get_int_from_host(ofld);
-    omp_set_nested(nested);
-}
-
-void omp_get_nested_lrb(
-    void *ofld
-)
-{
-    int nested;
-
-    nested = omp_get_nested();
-    omp_send_int_to_host(ofld, nested);
-}
-
-void omp_set_schedule_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    omp_sched_t kind;
-    int modifier;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &kind;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_in;
-    vars[1].ptr = &modifier;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    omp_set_schedule(kind, modifier);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_get_schedule_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    omp_sched_t kind;
-    int modifier;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_out;
-    vars[0].ptr = &kind;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_out;
-    vars[1].ptr = &modifier;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    omp_get_schedule(&kind, &modifier);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-// lock API functions
-
-void omp_init_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_out;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_init_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_destroy_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_destroy_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_set_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_set_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_unset_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_unset_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_test_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    omp_lock_target_t lock;
-    int result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_out;
-    vars[1].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    result = omp_test_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-// nested lock API functions
-
-void omp_init_nest_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_nest_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_out;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_init_nest_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_destroy_nest_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_nest_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_destroy_nest_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_set_nest_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_nest_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_set_nest_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_unset_nest_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    omp_nest_lock_target_t lock;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    omp_unset_nest_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void omp_test_nest_lock_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    omp_nest_lock_target_t lock;
-    int result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &lock;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_out;
-    vars[1].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    result = omp_test_nest_lock(&lock.lock);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-// kmp API functions
-
-void kmp_set_stacksize_lrb(
-    void *ofld
-)
-{
-    int size;
-
-    size = omp_get_int_from_host(ofld);
-    kmp_set_stacksize(size);
-}
-
-void kmp_get_stacksize_lrb(
-    void *ofld
-)
-{
-    int size;
-
-    size = kmp_get_stacksize();
-    omp_send_int_to_host(ofld, size);
-}
-
-void kmp_set_stacksize_s_lrb(
-    void *ofld
-)
-{
-    int size;
-
-    size = omp_get_int_from_host(ofld);
-    kmp_set_stacksize_s(size);
-}
-
-void kmp_get_stacksize_s_lrb(
-    void *ofld
-)
-{
-    int size;
-
-    size = kmp_get_stacksize_s();
-    omp_send_int_to_host(ofld, size);
-}
-
-void kmp_set_blocktime_lrb(
-    void *ofld
-)
-{
-    int time;
-
-    time = omp_get_int_from_host(ofld);
-    kmp_set_blocktime(time);
-}
-
-void kmp_get_blocktime_lrb(
-    void *ofld
-)
-{
-    int time;
-
-    time = kmp_get_blocktime();
-    omp_send_int_to_host(ofld, time);
-}
-
-void kmp_set_library_serial_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-
-    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
-    kmp_set_library_serial();
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_set_library_turnaround_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-
-    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
-    kmp_set_library_turnaround();
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_set_library_throughput_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-
-    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
-    kmp_set_library_throughput();
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_set_library_lrb(
-    void *ofld
-)
-{
-    int mode;
-
-    mode = omp_get_int_from_host(ofld);
-    kmp_set_library(mode);
-}
-
-void kmp_get_library_lrb(
-    void *ofld
-)
-{
-    int mode;
-
-    mode = kmp_get_library();
-    omp_send_int_to_host(ofld, mode);
-}
-
-void kmp_set_defaults_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    char *defaults = 0;
-
-    vars[0].type.src = c_string_ptr;
-    vars[0].type.dst = c_string_ptr;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &defaults;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    kmp_set_defaults(defaults);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-// affinity API functions
-
-void kmp_create_affinity_mask_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    kmp_affinity_mask_target_t mask;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_out;
-    vars[0].ptr = &mask;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    kmp_create_affinity_mask(&mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_destroy_affinity_mask_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[1] = {0};
-    kmp_affinity_mask_target_t mask;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &mask;
-
-    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
-    kmp_destroy_affinity_mask(&mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_set_affinity_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    kmp_affinity_mask_target_t mask;
-    int result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &mask;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_out;
-    vars[1].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    result = kmp_set_affinity(&mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_get_affinity_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[2] = {0};
-    kmp_affinity_mask_target_t mask;
-    int result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_inout;
-    vars[0].ptr = &mask;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_out;
-    vars[1].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
-    result = kmp_get_affinity(&mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_get_affinity_max_proc_lrb(
-    void *ofld
-)
-{
-    int max_proc;
-
-    max_proc = kmp_get_affinity_max_proc();
-    omp_send_int_to_host(ofld, max_proc);
-}
-
-void kmp_set_affinity_mask_proc_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[3] = {0};
-    kmp_affinity_mask_target_t mask;
-    int proc, result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &proc;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_inout;
-    vars[1].ptr = &mask;
-
-    vars[2].type.src = c_data;
-    vars[2].type.dst = c_data;
-    vars[2].direction.bits = c_parameter_out;
-    vars[2].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
-    result = kmp_set_affinity_mask_proc(proc, &mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_unset_affinity_mask_proc_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[3] = {0};
-    kmp_affinity_mask_target_t mask;
-    int proc, result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &proc;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_inout;
-    vars[1].ptr = &mask;
-
-    vars[2].type.src = c_data;
-    vars[2].type.dst = c_data;
-    vars[2].direction.bits = c_parameter_out;
-    vars[2].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
-    result = kmp_unset_affinity_mask_proc(proc, &mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-void kmp_get_affinity_mask_proc_lrb(
-    void *ofld_
-)
-{
-    OFFLOAD ofld = (OFFLOAD) ofld_;
-    VarDesc vars[3] = {0};
-    kmp_affinity_mask_target_t mask;
-    int proc, result;
-
-    vars[0].type.src = c_data;
-    vars[0].type.dst = c_data;
-    vars[0].direction.bits = c_parameter_in;
-    vars[0].ptr = &proc;
-
-    vars[1].type.src = c_data;
-    vars[1].type.dst = c_data;
-    vars[1].direction.bits = c_parameter_in;
-    vars[1].ptr = &mask;
-
-    vars[2].type.src = c_data;
-    vars[2].type.dst = c_data;
-    vars[2].direction.bits = c_parameter_out;
-    vars[2].ptr = &result;
-
-    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
-    result = kmp_get_affinity_mask_proc(proc, &mask.mask);
-    OFFLOAD_TARGET_LEAVE(ofld);
-}
-
-// Target-side stubs for the host functions (to avoid unresolveds)
-// These are needed for the offloadm table
-
-void omp_set_num_threads_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-)
-{
-}
-
-int omp_get_max_threads_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-int omp_get_num_procs_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void omp_set_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-)
-{
-}
-
-int omp_get_dynamic_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void omp_set_nested_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int num_threads
-)
-{
-}
-
-int omp_get_nested_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void omp_set_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t kind,
-    int modifier
-)
-{
-}
-
-void omp_get_schedule_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_sched_t *kind,
-    int *modifier
-)
-{
-}
-
-void omp_init_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-}
-
-void omp_destroy_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-}
-
-void omp_set_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-}
-
-void omp_unset_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-}
-
-int omp_test_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_lock_target_t *lock
-)
-{
-    return 0;
-}
-
-void omp_init_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-}
-
-void omp_destroy_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-}
-
-void omp_set_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-}
-
-void omp_unset_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-}
-
-int omp_test_nest_lock_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    omp_nest_lock_target_t *lock
-)
-{
-    return 0;
-}
-
-void kmp_set_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int size
-)
-{
-}
-
-int kmp_get_stacksize_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void kmp_set_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    size_t size
-)
-{
-}
-
-size_t kmp_get_stacksize_s_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void kmp_set_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int time
-)
-{
-}
-
-int kmp_get_blocktime_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void kmp_set_library_serial_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-}
-
-void kmp_set_library_turnaround_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-}
-
-void kmp_set_library_throughput_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-}
-
-void kmp_set_library_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int mode
-)
-{
-}
-
-int kmp_get_library_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-void kmp_set_defaults_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    char const *defaults
-)
-{
-}
-
-void kmp_create_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-}
-
-void kmp_destroy_affinity_mask_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-}
-
-int kmp_set_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    return 0;
-}
-
-int kmp_get_affinity_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    return 0;
-}
-
-int kmp_get_affinity_max_proc_target(
-    TARGET_TYPE target_type,
-    int target_number
-)
-{
-    return 0;
-}
-
-int kmp_set_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    return 0;
-}
-
-int kmp_unset_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    return 0;
-}
-
-int kmp_get_affinity_mask_proc_target(
-    TARGET_TYPE target_type,
-    int target_number,
-    int proc,
-    kmp_affinity_mask_target_t *mask
-)
-{
-    return 0;
-}
diff --git a/offload/src/offload_orsl.cpp b/offload/src/offload_orsl.cpp
deleted file mode 100644
index 6162f8a..0000000
--- a/offload/src/offload_orsl.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_orsl.h"
-#include <stdlib.h>
-#include "offload_host.h"
-#include "orsl-lite/include/orsl-lite.h"
-
-namespace ORSL {
-
-static bool            is_enabled = false;
-static const ORSLTag   my_tag = "Offload";
-
-void init()
-{
-    const char *env_var = getenv("OFFLOAD_ENABLE_ORSL");
-    if (env_var != 0 && *env_var != '\0') {
-        int64_t new_val;
-        if (__offload_parse_int_string(env_var, new_val)) {
-            is_enabled = new_val;
-        }
-        else {
-            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
-                             "OFFLOAD_ENABLE_ORSL");
-        }
-    }
-
-    if (is_enabled) {
-        OFFLOAD_DEBUG_TRACE(2, "ORSL is enabled\n");
-    }
-    else {
-        OFFLOAD_DEBUG_TRACE(2, "ORSL is disabled\n");
-    }
-}
-
-bool reserve(int device)
-{
-    if (is_enabled) {
-        int pnum = mic_engines[device].get_physical_index();
-        ORSLBusySet bset;
-
-        bset.type = BUSY_SET_FULL;
-        if (ORSLReserve(1, &pnum, &bset, my_tag) != 0) {
-            return false;
-        }
-    }
-    return true;
-}
-
-bool try_reserve(int device)
-{
-    if (is_enabled) {
-        int pnum = mic_engines[device].get_physical_index();
-        ORSLBusySet bset;
-
-        bset.type = BUSY_SET_FULL;
-        if (ORSLTryReserve(1, &pnum, &bset, my_tag) != 0) {
-            return false;
-        }
-    }
-    return true;
-}
-
-void release(int device)
-{
-    if (is_enabled) {
-        int pnum = mic_engines[device].get_physical_index();
-        ORSLBusySet bset;
-
-        bset.type = BUSY_SET_FULL;
-        if (ORSLRelease(1, &pnum, &bset, my_tag) != 0) {
-            // should never get here
-        }
-    }
-}
-
-} // namespace ORSL
diff --git a/offload/src/offload_orsl.h b/offload/src/offload_orsl.h
deleted file mode 100644
index cdb86f9..0000000
--- a/offload/src/offload_orsl.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_ORSL_H_INCLUDED
-#define OFFLOAD_ORSL_H_INCLUDED
-
-// ORSL interface
-namespace ORSL {
-
-extern void init();
-
-extern bool reserve(int device);
-extern bool try_reserve(int device);
-extern void release(int device);
-
-} // namespace ORSL
-
-#endif // OFFLOAD_ORSL_H_INCLUDED
diff --git a/offload/src/offload_table.cpp b/offload/src/offload_table.cpp
deleted file mode 100644
index cf165df..0000000
--- a/offload/src/offload_table.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_table.h"
-#include "offload_common.h"
-
-#if !HOST_LIBRARY
-// Predefined offload entries
-extern void omp_set_num_threads_lrb(void*);
-extern void omp_get_max_threads_lrb(void*);
-extern void omp_get_num_procs_lrb(void*);
-extern void omp_set_dynamic_lrb(void*);
-extern void omp_get_dynamic_lrb(void*);
-extern void omp_set_nested_lrb(void*);
-extern void omp_get_nested_lrb(void*);
-extern void omp_set_schedule_lrb(void*);
-extern void omp_get_schedule_lrb(void*);
-
-extern void omp_init_lock_lrb(void*);
-extern void omp_destroy_lock_lrb(void*);
-extern void omp_set_lock_lrb(void*);
-extern void omp_unset_lock_lrb(void*);
-extern void omp_test_lock_lrb(void*);
-
-extern void omp_init_nest_lock_lrb(void*);
-extern void omp_destroy_nest_lock_lrb(void*);
-extern void omp_set_nest_lock_lrb(void*);
-extern void omp_unset_nest_lock_lrb(void*);
-extern void omp_test_nest_lock_lrb(void*);
-
-extern void kmp_set_stacksize_lrb(void*);
-extern void kmp_get_stacksize_lrb(void*);
-extern void kmp_set_stacksize_s_lrb(void*);
-extern void kmp_get_stacksize_s_lrb(void*);
-extern void kmp_set_blocktime_lrb(void*);
-extern void kmp_get_blocktime_lrb(void*);
-extern void kmp_set_library_serial_lrb(void*);
-extern void kmp_set_library_turnaround_lrb(void*);
-extern void kmp_set_library_throughput_lrb(void*);
-extern void kmp_set_library_lrb(void*);
-extern void kmp_get_library_lrb(void*);
-extern void kmp_set_defaults_lrb(void*);
-
-extern void kmp_create_affinity_mask_lrb(void*);
-extern void kmp_destroy_affinity_mask_lrb(void*);
-extern void kmp_set_affinity_lrb(void*);
-extern void kmp_get_affinity_lrb(void*);
-extern void kmp_get_affinity_max_proc_lrb(void*);
-extern void kmp_set_affinity_mask_proc_lrb(void*);
-extern void kmp_unset_affinity_mask_proc_lrb(void*);
-extern void kmp_get_affinity_mask_proc_lrb(void*);
-
-// Predefined entries on the target side
-static FuncTable::Entry predefined_entries[] = {
-    "omp_set_num_threads_target",
-    (void*) &omp_set_num_threads_lrb,
-    "omp_get_max_threads_target",
-    (void*) &omp_get_max_threads_lrb,
-    "omp_get_num_procs_target",
-    (void*) &omp_get_num_procs_lrb,
-    "omp_set_dynamic_target",
-    (void*) &omp_set_dynamic_lrb,
-    "omp_get_dynamic_target",
-    (void*) &omp_get_dynamic_lrb,
-    "omp_set_nested_target",
-    (void*) &omp_set_nested_lrb,
-    "omp_get_nested_target",
-    (void*) &omp_get_nested_lrb,
-    "omp_set_schedule_target",
-    (void*) &omp_set_schedule_lrb,
-    "omp_get_schedule_target",
-    (void*) &omp_get_schedule_lrb,
-
-    "omp_init_lock_target",
-    (void*) &omp_init_lock_lrb,
-    "omp_destroy_lock_target",
-    (void*) &omp_destroy_lock_lrb,
-    "omp_set_lock_target",
-    (void*) &omp_set_lock_lrb,
-    "omp_unset_lock_target",
-    (void*) &omp_unset_lock_lrb,
-    "omp_test_lock_target",
-    (void*) &omp_test_lock_lrb,
-
-    "omp_init_nest_lock_target",
-    (void*) &omp_init_nest_lock_lrb,
-    "omp_destroy_nest_lock_target",
-    (void*) &omp_destroy_nest_lock_lrb,
-    "omp_set_nest_lock_target",
-    (void*) &omp_set_nest_lock_lrb,
-    "omp_unset_nest_lock_target",
-    (void*) &omp_unset_nest_lock_lrb,
-    "omp_test_nest_lock_target",
-    (void*) &omp_test_nest_lock_lrb,
-
-    "kmp_set_stacksize_target",
-    (void*) &kmp_set_stacksize_lrb,
-    "kmp_get_stacksize_target",
-    (void*) &kmp_get_stacksize_lrb,
-    "kmp_set_stacksize_s_target",
-    (void*) &kmp_set_stacksize_s_lrb,
-    "kmp_get_stacksize_s_target",
-    (void*) &kmp_get_stacksize_s_lrb,
-    "kmp_set_blocktime_target",
-    (void*) &kmp_set_blocktime_lrb,
-    "kmp_get_blocktime_target",
-    (void*) &kmp_get_blocktime_lrb,
-    "kmp_set_library_serial_target",
-    (void*) &kmp_set_library_serial_lrb,
-    "kmp_set_library_turnaround_target",
-    (void*) &kmp_set_library_turnaround_lrb,
-    "kmp_set_library_throughput_target",
-    (void*) &kmp_set_library_throughput_lrb,
-    "kmp_set_library_target",
-    (void*) &kmp_set_library_lrb,
-    "kmp_get_library_target",
-    (void*) &kmp_get_library_lrb,
-    "kmp_set_defaults_target",
-    (void*) &kmp_set_defaults_lrb,
-
-    "kmp_create_affinity_mask_target",
-    (void*) &kmp_create_affinity_mask_lrb,
-    "kmp_destroy_affinity_mask_target",
-    (void*) &kmp_destroy_affinity_mask_lrb,
-    "kmp_set_affinity_target",
-    (void*) &kmp_set_affinity_lrb,
-    "kmp_get_affinity_target",
-    (void*) &kmp_get_affinity_lrb,
-    "kmp_get_affinity_max_proc_target",
-    (void*) &kmp_get_affinity_max_proc_lrb,
-    "kmp_set_affinity_mask_proc_target",
-    (void*) &kmp_set_affinity_mask_proc_lrb,
-    "kmp_unset_affinity_mask_proc_target",
-    (void*) &kmp_unset_affinity_mask_proc_lrb,
-    "kmp_get_affinity_mask_proc_target",
-    (void*) &kmp_get_affinity_mask_proc_lrb,
-
-    (const char*) -1,
-    (void*) -1
-};
-
-static FuncList::Node predefined_table = {
-    { predefined_entries, -1 },
-    0, 0
-};
-
-// Entry table
-FuncList __offload_entries(&predefined_table);
-#else
-FuncList __offload_entries;
-#endif // !HOST_LIBRARY
-
-// Function table. No predefined entries.
-FuncList __offload_funcs;
-
-// Var table
-VarList  __offload_vars;
-
-// Given the function name returns the associtated function pointer
-const void* FuncList::find_addr(const char *name)
-{
-    const void* func = 0;
-
-    m_lock.lock();
-
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->name != 0 && strcmp(e->name, name) == 0) {
-                func = e->func;
-                break;
-            }
-        }
-    }
-
-    m_lock.unlock();
-
-    return func;
-}
-
-// Given the function pointer returns the associtated function name
-const char* FuncList::find_name(const void *func)
-{
-    const char* name = 0;
-
-    m_lock.lock();
-
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->func == func) {
-                name = e->name;
-                break;
-            }
-        }
-    }
-
-    m_lock.unlock();
-
-    return name;
-}
-
-// Returns max name length from all tables
-int64_t FuncList::max_name_length(void)
-{
-    if (m_max_name_len < 0) {
-        m_lock.lock();
-
-        m_max_name_len = 0;
-        for (Node *n = m_head; n != 0; n = n->next) {
-            if (n->table.max_name_len < 0) {
-                n->table.max_name_len = 0;
-
-                // calculate max name length in a single table
-                for (const Table::Entry *e = n->table.entries;
-                     e->name != (const char*) -1; e++) {
-                    if (e->name != 0) {
-                        size_t len = strlen(e->name) + 1;
-                        if (n->table.max_name_len < len) {
-                            n->table.max_name_len = len;
-                        }
-                    }
-                }
-            }
-
-            // select max from all tables
-            if (m_max_name_len < n->table.max_name_len) {
-                m_max_name_len = n->table.max_name_len;
-            }
-        }
-
-        m_lock.unlock();
-    }
-    return m_max_name_len;
-}
-
-// Debugging dump
-void FuncList::dump(void)
-{
-    OFFLOAD_DEBUG_TRACE(2, "Function table:\n");
-
-    m_lock.lock();
-
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->name != 0) {
-                OFFLOAD_DEBUG_TRACE(2, "%p %s\n", e->func, e->name);
-            }
-        }
-    }
-
-    m_lock.unlock();
-}
-
-// Debugging dump
-void VarList::dump(void)
-{
-    OFFLOAD_DEBUG_TRACE(2, "Var table:\n");
-
-    m_lock.lock();
-
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->name != 0) {
-#if HOST_LIBRARY
-                OFFLOAD_DEBUG_TRACE(2, "%s %p %ld\n", e->name, e->addr,
-                                    e->size);
-#else  // HOST_LIBRARY
-                OFFLOAD_DEBUG_TRACE(2, "%s %p\n", e->name, e->addr);
-#endif // HOST_LIBRARY
-            }
-        }
-    }
-
-    m_lock.unlock();
-}
-
-//
-int64_t VarList::table_size(int64_t &nelems)
-{
-    int64_t length = 0;
-
-    nelems = 0;
-
-    // calculate string table size and number of elements
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->name != 0) {
-                length += strlen(e->name) + 1;
-                nelems++;
-            }
-        }
-    }
-
-    return nelems * sizeof(BufEntry) + length;
-}
-
-// copy table to the gven buffer
-void VarList::table_copy(void *buf, int64_t nelems)
-{
-    BufEntry* elems = static_cast<BufEntry*>(buf);
-    char*     names = reinterpret_cast<char*>(elems + nelems);
-
-    // copy entries to buffer
-    for (Node *n = m_head; n != 0; n = n->next) {
-        for (const Table::Entry *e = n->table.entries;
-             e->name != (const char*) -1; e++) {
-            if (e->name != 0) {
-                // name field contains offset to the name from the beginning
-                // of the buffer
-                elems->name = names - static_cast<char*>(buf);
-                elems->addr = reinterpret_cast<intptr_t>(e->addr);
-
-                // copy name to string table
-                const char *name = e->name;
-                while ((*names++ = *name++) != '\0');
-
-                elems++;
-            }
-        }
-    }
-}
-
-// patch name offsets in a buffer
-void VarList::table_patch_names(void *buf, int64_t nelems)
-{
-    BufEntry* elems = static_cast<BufEntry*>(buf);
-    for (int i = 0; i < nelems; i++) {
-        elems[i].name += reinterpret_cast<intptr_t>(buf);
-    }
-}
-
-// Adds given list element to the global lookup table list
-extern "C" void __offload_register_tables(
-    FuncList::Node *entry_table,
-    FuncList::Node *func_table,
-    VarList::Node *var_table
-)
-{
-    OFFLOAD_DEBUG_TRACE(2, "Registering offload function entry table %p\n",
-                           entry_table);
-    __offload_entries.add_table(entry_table);
-
-    OFFLOAD_DEBUG_TRACE(2, "Registering function table %p\n", func_table);
-    __offload_funcs.add_table(func_table);
-
-    OFFLOAD_DEBUG_TRACE(2, "Registering var table %p\n", var_table);
-    __offload_vars.add_table(var_table);
-}
-
-// Removes given list element from the global lookup table list
-extern "C" void __offload_unregister_tables(
-    FuncList::Node *entry_table,
-    FuncList::Node *func_table,
-    VarList::Node *var_table
-)
-{
-    __offload_entries.remove_table(entry_table);
-
-    OFFLOAD_DEBUG_TRACE(2, "Unregistering function table %p\n", func_table);
-    __offload_funcs.remove_table(func_table);
-
-    OFFLOAD_DEBUG_TRACE(2, "Unregistering var table %p\n", var_table);
-    __offload_vars.remove_table(var_table);
-}
diff --git a/offload/src/offload_table.h b/offload/src/offload_table.h
deleted file mode 100644
index cfced3e..0000000
--- a/offload/src/offload_table.h
+++ /dev/null
@@ -1,301 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-/*! \file
-    \brief Function and Variable tables used by the runtime library
-*/
-
-#ifndef OFFLOAD_TABLE_H_INCLUDED
-#define OFFLOAD_TABLE_H_INCLUDED
-
-#include <iterator>
-#include "offload_util.h"
-
-// Template representing double linked list of tables
-template <typename T> class TableList {
-public:
-    // table type
-    typedef T Table;
-
-    // List node
-    struct Node {
-        Table   table;
-        Node*   prev;
-        Node*   next;
-    };
-
-public:
-    explicit TableList(Node *node = 0) : m_head(node) {}
-
-    void add_table(Node *node) {
-        m_lock.lock();
-
-        if (m_head != 0) {
-            node->next = m_head;
-            m_head->prev = node;
-        }
-        m_head = node;
-
-        m_lock.unlock();
-    }
-
-    void remove_table(Node *node) {
-        m_lock.lock();
-
-        if (node->next != 0) {
-            node->next->prev = node->prev;
-        }
-        if (node->prev != 0) {
-            node->prev->next = node->next;
-        }
-        if (m_head == node) {
-            m_head = node->next;
-        }
-
-        m_lock.unlock();
-    }
-
-protected:
-    Node*           m_head;
-    mutex_t         m_lock;
-};
-
-// Function lookup table.
-struct FuncTable {
-    //! Function table entry
-    /*! This table contains functions created from offload regions.   */
-    /*! Each entry consists of a pointer to the function's "key"
-        and the function address.                                     */
-    /*! Each shared library or executable may contain one such table. */
-    /*! The end of the table is marked with an entry whose name field
-        has value -1.                                                 */
-    struct Entry {
-        const char* name; //!< Name of the function
-        void*       func; //!< Address of the function
-    };
-
-    // entries
-    const Entry *entries;
-
-    // max name length
-    int64_t max_name_len;
-};
-
-// Function table
-class FuncList : public TableList<FuncTable> {
-public:
-    explicit FuncList(Node *node = 0) : TableList<Table>(node),
-                                        m_max_name_len(-1)
-    {}
-
-    // add table to the list
-    void add_table(Node *node) {
-        // recalculate max function name length
-        m_max_name_len = -1;
-
-        // add table
-        TableList<Table>::add_table(node);
-    }
-
-    // find function address for the given name
-    const void* find_addr(const char *name);
-
-    // find function name for the given address
-    const char* find_name(const void *addr);
-
-    // max name length from all tables in the list
-    int64_t max_name_length(void);
-
-    // debug dump
-    void dump(void);
-
-private:
-    // max name length within from all tables
-    int64_t m_max_name_len;
-};
-
-// Table entry for static variables
-struct VarTable {
-    //! Variable table entry
-    /*! This table contains statically allocated variables marked with
-        __declspec(target(mic) or #pragma omp declare target.           */
-    /*! Each entry consists of a pointer to the variable's "key",
-        the variable address and its size in bytes.                     */
-    /*! Because memory allocation is done from the host,
-        the MIC table does not need the size of the variable.           */
-    /*! Padding to make the table entry size a power of 2 is necessary
-        to avoid "holes" between table contributions from different object
-        files on Windows when debug information is specified with /Zi.  */
-    struct Entry {
-        const char* name; //!< Name of the variable
-        void*       addr; //!< Address of the variable
-
-#if HOST_LIBRARY
-        uint64_t    size;
-
-#ifdef TARGET_WINNT
-		// padding to make entry size a power of 2
-        uint64_t    padding;
-#endif // TARGET_WINNT
-#endif
-    };
-
-    // Table terminated by an entry with name == -1
-    const Entry *entries;
-};
-
-// List of var tables
-class VarList : public TableList<VarTable> {
-public:
-    VarList() : TableList<Table>()
-    {}
-
-    // debug dump
-    void dump();
-
-public:
-    // var table list iterator
-    class Iterator : public std::iterator<std::input_iterator_tag,
-                                          Table::Entry> {
-    public:
-        Iterator() : m_node(0), m_entry(0) {}
-
-        explicit Iterator(Node *node) {
-            new_node(node);
-        }
-
-        Iterator& operator++() {
-            if (m_entry != 0) {
-                m_entry++;
-                while (m_entry->name == 0) {
-                    m_entry++;
-                }
-                if (m_entry->name == reinterpret_cast<const char*>(-1)) {
-                    new_node(m_node->next);
-                }
-            }
-            return *this;
-        }
-
-        bool operator==(const Iterator &other) const {
-            return m_entry == other.m_entry;
-        }
-
-        bool operator!=(const Iterator &other) const {
-            return m_entry != other.m_entry;
-        }
-
-        const Table::Entry* operator*() const {
-            return m_entry;
-        }
-
-    private:
-        void new_node(Node *node) {
-            m_node = node;
-            m_entry = 0;
-            while (m_node != 0) {
-                m_entry = m_node->table.entries;
-                while (m_entry->name == 0) {
-                    m_entry++;
-                }
-                if (m_entry->name != reinterpret_cast<const char*>(-1)) {
-                    break;
-                }
-                m_node = m_node->next;
-                m_entry = 0;
-            }
-        }
-
-    private:
-        Node                *m_node;
-        const Table::Entry  *m_entry;
-    };
-
-    Iterator begin() const {
-        return Iterator(m_head);
-    }
-
-    Iterator end() const {
-        return Iterator();
-    }
-
-public:
-    // Entry representation in a copy buffer
-    struct BufEntry {
-        intptr_t name;
-        intptr_t addr;
-    };
-
-    // Calculate the number of elements in the table and
-    // returns the size of buffer for the table
-    int64_t table_size(int64_t &nelems);
-
-    // Copy table contents to given buffer. It is supposed to be large
-    // enough to hold all elements as string table.
-    void table_copy(void *buf, int64_t nelems);
-
-    // Patch name offsets in a table after it's been copied to other side
-    static void table_patch_names(void *buf, int64_t nelems);
-};
-
-extern FuncList __offload_entries;
-extern FuncList __offload_funcs;
-extern VarList  __offload_vars;
-
-// Section names where the lookup tables are stored
-#ifdef TARGET_WINNT
-#define OFFLOAD_ENTRY_TABLE_SECTION_START   ".OffloadEntryTable$a"
-#define OFFLOAD_ENTRY_TABLE_SECTION_END     ".OffloadEntryTable$z"
-
-#define OFFLOAD_FUNC_TABLE_SECTION_START    ".OffloadFuncTable$a"
-#define OFFLOAD_FUNC_TABLE_SECTION_END      ".OffloadFuncTable$z"
-
-#define OFFLOAD_VAR_TABLE_SECTION_START     ".OffloadVarTable$a"
-#define OFFLOAD_VAR_TABLE_SECTION_END       ".OffloadVarTable$z"
-
-#define OFFLOAD_CRTINIT_SECTION_START       ".CRT$XCT"
-
-#pragma section(OFFLOAD_CRTINIT_SECTION_START, read)
-
-#else  // TARGET_WINNT
-
-#define OFFLOAD_ENTRY_TABLE_SECTION_START   ".OffloadEntryTable."
-#define OFFLOAD_ENTRY_TABLE_SECTION_END     ".OffloadEntryTable."
-
-#define OFFLOAD_FUNC_TABLE_SECTION_START    ".OffloadFuncTable."
-#define OFFLOAD_FUNC_TABLE_SECTION_END      ".OffloadFuncTable."
-
-#define OFFLOAD_VAR_TABLE_SECTION_START     ".OffloadVarTable."
-#define OFFLOAD_VAR_TABLE_SECTION_END       ".OffloadVarTable."
-#endif // TARGET_WINNT
-
-#pragma section(OFFLOAD_ENTRY_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_ENTRY_TABLE_SECTION_END, read, write)
-
-#pragma section(OFFLOAD_FUNC_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_FUNC_TABLE_SECTION_END, read, write)
-
-#pragma section(OFFLOAD_VAR_TABLE_SECTION_START, read, write)
-#pragma section(OFFLOAD_VAR_TABLE_SECTION_END, read, write)
-
-
-// register/unregister given tables
-extern "C" void __offload_register_tables(
-    FuncList::Node *entry_table,
-    FuncList::Node *func_table,
-    VarList::Node *var_table
-);
-
-extern "C" void __offload_unregister_tables(
-    FuncList::Node *entry_table,
-    FuncList::Node *func_table,
-    VarList::Node *var_table
-);
-#endif  // OFFLOAD_TABLE_H_INCLUDED
diff --git a/offload/src/offload_target.cpp b/offload/src/offload_target.cpp
deleted file mode 100644
index cfc1b04..0000000
--- a/offload/src/offload_target.cpp
+++ /dev/null
@@ -1,754 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_target.h"
-#include <stdlib.h>
-#include <unistd.h>
-#ifdef SEP_SUPPORT
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#endif // SEP_SUPPORT
-#include <omp.h>
-#include <map>
-
-// typedef offload_func_with_parms.
-// Pointer to function that represents an offloaded entry point.
-// The parameters are a temporary fix for parameters on the stack.
-typedef void (*offload_func_with_parms)(void *);
-
-// Target console and file logging
-const char *prefix;
-int console_enabled = 0;
-int offload_report_level = 0;
-
-// Trace information
-static const char* vardesc_direction_as_string[] = {
-    "NOCOPY",
-    "IN",
-    "OUT",
-    "INOUT"
-};
-static const char* vardesc_type_as_string[] = {
-    "unknown",
-    "data",
-    "data_ptr",
-    "func_ptr",
-    "void_ptr",
-    "string_ptr",
-    "dv",
-    "dv_data",
-    "dv_data_slice",
-    "dv_ptr",
-    "dv_ptr_data",
-    "dv_ptr_data_slice",
-    "cean_var",
-    "cean_var_ptr",
-    "c_data_ptr_array"
-};
-
-int mic_index = -1;
-int mic_engines_total = -1;
-uint64_t mic_frequency = 0;
-int offload_number = 0;
-static std::map<void*, RefInfo*> ref_data;
-static mutex_t add_ref_lock;
-
-#ifdef SEP_SUPPORT
-static const char*  sep_monitor_env = "SEP_MONITOR";
-static bool         sep_monitor = false;
-static const char*  sep_device_env = "SEP_DEVICE";
-static const char*  sep_device =  "/dev/sep3.8/c";
-static int          sep_counter = 0;
-
-#define SEP_API_IOC_MAGIC   99
-#define SEP_IOCTL_PAUSE     _IO (SEP_API_IOC_MAGIC, 31)
-#define SEP_IOCTL_RESUME    _IO (SEP_API_IOC_MAGIC, 32)
-
-static void add_ref_count(void * buf, bool created)
-{
-    mutex_locker_t locker(add_ref_lock);
-    RefInfo * info = ref_data[buf];
-
-    if (info) {
-        info->count++;
-    }
-    else {
-        info = new RefInfo((int)created,(long)1);
-    }
-    info->is_added |= created;
-    ref_data[buf] = info;
-}
-
-static void BufReleaseRef(void * buf)
-{
-    mutex_locker_t locker(add_ref_lock);
-    RefInfo * info = ref_data[buf];
-
-    if (info) {
-        --info->count;
-        if (info->count == 0 && info->is_added) {
-            BufferReleaseRef(buf);
-            info->is_added = 0;
-        }
-    }
-}
-
-static int VTPauseSampling(void)
-{
-    int ret = -1;
-    int handle = open(sep_device, O_RDWR);
-    if (handle > 0) {
-        ret = ioctl(handle, SEP_IOCTL_PAUSE);
-        close(handle);
-    }
-    return ret;
-}
-
-static int VTResumeSampling(void)
-{
-    int ret = -1;
-    int handle = open(sep_device, O_RDWR);
-    if (handle > 0) {
-        ret = ioctl(handle, SEP_IOCTL_RESUME);
-        close(handle);
-    }
-    return ret;
-}
-#endif // SEP_SUPPORT
-
-void OffloadDescriptor::offload(
-    uint32_t  buffer_count,
-    void**    buffers,
-    void*     misc_data,
-    uint16_t  misc_data_len,
-    void*     return_data,
-    uint16_t  return_data_len
-)
-{
-    FunctionDescriptor *func = (FunctionDescriptor*) misc_data;
-    const char *name = func->data;
-    OffloadDescriptor ofld;
-    char *in_data = 0;
-    char *out_data = 0;
-    char *timer_data = 0;
-
-    console_enabled = func->console_enabled;
-    timer_enabled = func->timer_enabled;
-    offload_report_level = func->offload_report_level;
-    offload_number = func->offload_number;
-    ofld.set_offload_number(func->offload_number);
-
-#ifdef SEP_SUPPORT
-    if (sep_monitor) {
-        if (__sync_fetch_and_add(&sep_counter, 1) == 0) {
-            OFFLOAD_DEBUG_TRACE(2, "VTResumeSampling\n");
-            VTResumeSampling();
-        }
-    }
-#endif // SEP_SUPPORT
-
-    OFFLOAD_DEBUG_TRACE_1(2, ofld.get_offload_number(),
-                          c_offload_start_target_func,
-                          "Offload \"%s\" started\n", name);
-
-    // initialize timer data
-    OFFLOAD_TIMER_INIT();
-
-    OFFLOAD_TIMER_START(c_offload_target_total_time);
-
-    OFFLOAD_TIMER_START(c_offload_target_descriptor_setup);
-
-    // get input/output buffer addresses
-    if (func->in_datalen > 0 || func->out_datalen > 0) {
-        if (func->data_offset != 0) {
-            in_data = (char*) misc_data + func->data_offset;
-            out_data = (char*) return_data;
-        }
-        else {
-            char *inout_buf = (char*) buffers[--buffer_count];
-            in_data = inout_buf;
-            out_data = inout_buf;
-        }
-    }
-
-    // assign variable descriptors
-    ofld.m_vars_total = func->vars_num;
-    if (ofld.m_vars_total > 0) {
-        uint64_t var_data_len = ofld.m_vars_total * sizeof(VarDesc);
-
-        ofld.m_vars = (VarDesc*) malloc(var_data_len);
-        memcpy(ofld.m_vars, in_data, var_data_len);
-
-        in_data += var_data_len;
-        func->in_datalen -= var_data_len;
-    }
-
-    // timer data
-    if (func->timer_enabled) {
-        uint64_t timer_data_len = OFFLOAD_TIMER_DATALEN();
-
-        timer_data = out_data;
-        out_data += timer_data_len;
-        func->out_datalen -= timer_data_len;
-    }
-
-    // init Marshallers
-    ofld.m_in.init_buffer(in_data, func->in_datalen);
-    ofld.m_out.init_buffer(out_data, func->out_datalen);
-
-    // copy buffers to offload descriptor
-    std::copy(buffers, buffers + buffer_count,
-              std::back_inserter(ofld.m_buffers));
-
-    OFFLOAD_TIMER_STOP(c_offload_target_descriptor_setup);
-
-    // find offload entry address
-    OFFLOAD_TIMER_START(c_offload_target_func_lookup);
-
-    offload_func_with_parms entry = (offload_func_with_parms)
-        __offload_entries.find_addr(name);
-
-    if (entry == NULL) {
-#if OFFLOAD_DEBUG > 0
-        if (console_enabled > 2) {
-            __offload_entries.dump();
-        }
-#endif
-        LIBOFFLOAD_ERROR(c_offload_descriptor_offload, name);
-        exit(1);
-    }
-
-    OFFLOAD_TIMER_STOP(c_offload_target_func_lookup);
-
-    OFFLOAD_TIMER_START(c_offload_target_func_time);
-
-    // execute offload entry
-    entry(&ofld);
-
-    OFFLOAD_TIMER_STOP(c_offload_target_func_time);
-
-    OFFLOAD_TIMER_STOP(c_offload_target_total_time);
-
-    // copy timer data to the buffer
-    OFFLOAD_TIMER_TARGET_DATA(timer_data);
-
-    OFFLOAD_DEBUG_TRACE(2, "Offload \"%s\" finished\n", name);
-
-#ifdef SEP_SUPPORT
-    if (sep_monitor) {
-        if (__sync_sub_and_fetch(&sep_counter, 1) == 0) {
-            OFFLOAD_DEBUG_TRACE(2, "VTPauseSampling\n");
-            VTPauseSampling();
-        }
-    }
-#endif // SEP_SUPPORT
-}
-
-void OffloadDescriptor::merge_var_descs(
-    VarDesc *vars,
-    VarDesc2 *vars2,
-    int vars_total
-)
-{
-    // number of variable descriptors received from host and generated
-    // locally should match
-    if (m_vars_total < vars_total) {
-        LIBOFFLOAD_ERROR(c_merge_var_descs1);
-        exit(1);
-    }
-
-    for (int i = 0; i < m_vars_total; i++) {
-        if (i < vars_total) {
-            // variable type must match
-            if (m_vars[i].type.bits != vars[i].type.bits) {
-                LIBOFFLOAD_ERROR(c_merge_var_descs2);
-                exit(1);
-            }
-
-            m_vars[i].ptr = vars[i].ptr;
-            m_vars[i].into = vars[i].into;
-
-            const char *var_sname = "";
-            if (vars2 != NULL) {
-                if (vars2[i].sname != NULL) {
-                    var_sname = vars2[i].sname;
-                }
-            }
-            OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var,
-                "   VarDesc %d, var=%s, %s, %s\n",
-                i, var_sname,
-                vardesc_direction_as_string[m_vars[i].direction.bits],
-                vardesc_type_as_string[m_vars[i].type.src]);
-            if (vars2 != NULL && vars2[i].dname != NULL) {
-                OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
-                    vardesc_type_as_string[m_vars[i].type.dst]);
-            }
-        }
-        OFFLOAD_TRACE(2,
-            "              type_src=%d, type_dstn=%d, direction=%d, "
-            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
-            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n",
-            m_vars[i].type.src,
-            m_vars[i].type.dst,
-            m_vars[i].direction.bits,
-            m_vars[i].alloc_if,
-            m_vars[i].free_if,
-            m_vars[i].align,
-            m_vars[i].mic_offset,
-            m_vars[i].flags.bits,
-            m_vars[i].offset,
-            m_vars[i].size,
-            m_vars[i].count,
-            m_vars[i].ptr,
-            m_vars[i].into);
-    }
-}
-
-void OffloadDescriptor::scatter_copyin_data()
-{
-    OFFLOAD_TIMER_START(c_offload_target_scatter_inputs);
-
-    OFFLOAD_DEBUG_TRACE(2, "IN  buffer @ %p size %lld\n",
-                        m_in.get_buffer_start(),
-                        m_in.get_buffer_size());
-    OFFLOAD_DEBUG_DUMP_BYTES(2, m_in.get_buffer_start(),
-                             m_in.get_buffer_size());
-
-    // receive data
-    for (int i = 0; i < m_vars_total; i++) {
-        bool src_is_for_mic = (m_vars[i].direction.out ||
-                               m_vars[i].into == NULL);
-        void** ptr_addr = src_is_for_mic ?
-                          static_cast<void**>(m_vars[i].ptr) :
-                          static_cast<void**>(m_vars[i].into);
-        int type = src_is_for_mic ? m_vars[i].type.src :
-                                    m_vars[i].type.dst;
-        bool is_static = src_is_for_mic ?
-                         m_vars[i].flags.is_static :
-                         m_vars[i].flags.is_static_dstn;
-        void *ptr = NULL;
-
-        if (m_vars[i].flags.alloc_disp) {
-            int64_t offset = 0;
-            m_in.receive_data(&offset, sizeof(offset));
-            m_vars[i].offset = -offset;
-        }
-        if (VAR_TYPE_IS_DV_DATA_SLICE(type) ||
-            VAR_TYPE_IS_DV_DATA(type)) {
-            ArrDesc *dvp = (type == c_dv_data_slice || type == c_dv_data)?
-                  reinterpret_cast<ArrDesc*>(ptr_addr) :
-                  *reinterpret_cast<ArrDesc**>(ptr_addr);
-            ptr_addr = reinterpret_cast<void**>(&dvp->Base);
-        }
-
-        // Set pointer values
-        switch (type) {
-            case c_data_ptr_array:
-                {
-                    int j = m_vars[i].ptr_arr_offset;
-                    int max_el = j + m_vars[i].count;
-                    char *dst_arr_ptr = (src_is_for_mic)?
-                        *(reinterpret_cast<char**>(m_vars[i].ptr)) :
-                        reinterpret_cast<char*>(m_vars[i].into);
-
-                    for (; j < max_el; j++) {
-                        if (src_is_for_mic) {
-                            m_vars[j].ptr =
-                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
-                        }
-                        else {
-                            m_vars[j].into =
-                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
-                        }
-                    }
-                }
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-            case c_dv:
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_ptr:
-                if (m_vars[i].alloc_if) {
-                    void *buf;
-                    if (m_vars[i].flags.sink_addr) {
-                        m_in.receive_data(&buf, sizeof(buf));
-                    }
-                    else {
-                        buf = m_buffers.front();
-                        m_buffers.pop_front();
-                    }
-                    if (buf) {
-                        if (!is_static) {
-                            if (!m_vars[i].flags.sink_addr) {
-                                // increment buffer reference
-                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
-                                BufferAddRef(buf);
-                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
-                            }
-                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
-                        }
-                        ptr = static_cast<char*>(buf) +
-                                  m_vars[i].mic_offset +
-                                  (m_vars[i].flags.is_stack_buf ?
-                                   0 : m_vars[i].offset);
-                    }
-                    *ptr_addr = ptr;
-                }
-                else if (m_vars[i].flags.sink_addr) {
-                    void *buf;
-                    m_in.receive_data(&buf, sizeof(buf));
-                    void *ptr = static_cast<char*>(buf) +
-                                    m_vars[i].mic_offset +
-                                    (m_vars[i].flags.is_stack_buf ?
-                                     0 : m_vars[i].offset);
-                    *ptr_addr = ptr;
-                }
-                break;
-
-            case c_func_ptr:
-                break;
-
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                if (m_vars[i].alloc_if) {
-                    void *buf;
-                    if (m_vars[i].flags.sink_addr) {
-                        m_in.receive_data(&buf, sizeof(buf));
-                    }
-                    else {
-                        buf = m_buffers.front();
-                        m_buffers.pop_front();
-                    }
-                    if (buf) {
-                        if (!is_static) {
-                            if (!m_vars[i].flags.sink_addr) {
-                                // increment buffer reference
-                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
-                                BufferAddRef(buf);
-                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
-                            }
-                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
-                        }
-                        ptr = static_cast<char*>(buf) +
-                            m_vars[i].mic_offset + m_vars[i].offset;
-                    }
-                    *ptr_addr = ptr;
-                }
-                else if (m_vars[i].flags.sink_addr) {
-                    void *buf;
-                    m_in.receive_data(&buf, sizeof(buf));
-                    ptr = static_cast<char*>(buf) +
-                          m_vars[i].mic_offset + m_vars[i].offset;
-                    *ptr_addr = ptr;
-                }
-                break;
-
-            default:
-                LIBOFFLOAD_ERROR(c_unknown_var_type, type);
-                abort();
-        }
-        // Release obsolete buffers for stack of persistent objects
-        if (type = c_data_ptr &&
-            m_vars[i].flags.is_stack_buf &&
-            !m_vars[i].direction.bits &&
-            m_vars[i].alloc_if &&
-            m_vars[i].size != 0) {
-                for (int j=0; j < m_vars[i].size; j++) {
-                    void *buf;
-                    m_in.receive_data(&buf, sizeof(buf));
-                    BufferReleaseRef(buf);
-                    ref_data.erase(buf);
-                }
-        }
-        // Do copyin
-        switch (m_vars[i].type.dst) {
-            case c_data_ptr_array:
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-                if (m_vars[i].direction.in &&
-                    !m_vars[i].flags.is_static_dstn) {
-                    int64_t size;
-                    int64_t disp;
-                    char* ptr = m_vars[i].into ?
-                                 static_cast<char*>(m_vars[i].into) :
-                                 static_cast<char*>(m_vars[i].ptr);
-                    if (m_vars[i].type.dst == c_cean_var) {
-                        m_in.receive_data((&size), sizeof(int64_t));
-                        m_in.receive_data((&disp), sizeof(int64_t));
-                    }
-                    else {
-                        size = m_vars[i].size;
-                        disp = 0;
-                    }
-                    m_in.receive_data(ptr + disp, size);
-                }
-                break;
-
-            case c_dv:
-                if (m_vars[i].direction.bits ||
-                    m_vars[i].alloc_if ||
-                    m_vars[i].free_if) {
-                    char* ptr = m_vars[i].into ?
-                                 static_cast<char*>(m_vars[i].into) :
-                                 static_cast<char*>(m_vars[i].ptr);
-                    m_in.receive_data(ptr + sizeof(uint64_t),
-                                      m_vars[i].size - sizeof(uint64_t));
-                }
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_ptr:
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                break;
-
-            case c_func_ptr:
-                if (m_vars[i].direction.in) {
-                    m_in.receive_func_ptr((const void**) m_vars[i].ptr);
-                }
-                break;
-
-            default:
-                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
-                abort();
-        }
-    }
-
-    OFFLOAD_TRACE(1, "Total copyin data received from host: [%lld] bytes\n",
-                  m_in.get_tfr_size());
-
-    OFFLOAD_TIMER_STOP(c_offload_target_scatter_inputs);
-
-    OFFLOAD_TIMER_START(c_offload_target_compute);
-}
-
-void OffloadDescriptor::gather_copyout_data()
-{
-    OFFLOAD_TIMER_STOP(c_offload_target_compute);
-
-    OFFLOAD_TIMER_START(c_offload_target_gather_outputs);
-
-    for (int i = 0; i < m_vars_total; i++) {
-        bool src_is_for_mic = (m_vars[i].direction.out ||
-                               m_vars[i].into == NULL);
-
-        switch (m_vars[i].type.src) {
-            case c_data_ptr_array:
-                break;
-            case c_data:
-            case c_void_ptr:
-            case c_cean_var:
-                if (m_vars[i].direction.out &&
-                    !m_vars[i].flags.is_static) {
-                    m_out.send_data(
-                        static_cast<char*>(m_vars[i].ptr) + m_vars[i].disp,
-                        m_vars[i].size);
-                }
-                break;
-
-            case c_dv:
-                break;
-
-            case c_string_ptr:
-            case c_data_ptr:
-            case c_cean_var_ptr:
-            case c_dv_ptr:
-                if (m_vars[i].free_if &&
-                    src_is_for_mic &&
-                    !m_vars[i].flags.is_static) {
-                    void *buf = *static_cast<char**>(m_vars[i].ptr) -
-                                    m_vars[i].mic_offset -
-                                    (m_vars[i].flags.is_stack_buf?
-                                     0 : m_vars[i].offset);
-                    if (buf == NULL) {
-                        break;
-                    }
-                    // decrement buffer reference count
-                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
-                    BufReleaseRef(buf);
-                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
-                }
-                break;
-
-            case c_func_ptr:
-                if (m_vars[i].direction.out) {
-                    m_out.send_func_ptr(*((void**) m_vars[i].ptr));
-                }
-                break;
-
-            case c_dv_data:
-            case c_dv_ptr_data:
-            case c_dv_data_slice:
-            case c_dv_ptr_data_slice:
-                if (src_is_for_mic &&
-                    m_vars[i].free_if &&
-                    !m_vars[i].flags.is_static) {
-                    ArrDesc *dvp = (m_vars[i].type.src == c_dv_data ||
-                                    m_vars[i].type.src == c_dv_data_slice) ?
-                        static_cast<ArrDesc*>(m_vars[i].ptr) :
-                        *static_cast<ArrDesc**>(m_vars[i].ptr);
-
-                    void *buf = reinterpret_cast<char*>(dvp->Base) -
-                                m_vars[i].mic_offset -
-                                m_vars[i].offset;
-
-                    if (buf == NULL) {
-                        break;
-                    }
-
-                    // decrement buffer reference count
-                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
-                    BufReleaseRef(buf);
-                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
-                }
-                break;
-
-            default:
-                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
-                abort();
-        }
-
-        if (m_vars[i].into) {
-            switch (m_vars[i].type.dst) {
-                case c_data_ptr_array:
-                    break;
-                case c_data:
-                case c_void_ptr:
-                case c_cean_var:
-                case c_dv:
-                    break;
-
-                case c_string_ptr:
-                case c_data_ptr:
-                case c_cean_var_ptr:
-                case c_dv_ptr:
-                    if (m_vars[i].direction.in &&
-                        m_vars[i].free_if &&
-                        !m_vars[i].flags.is_static_dstn) {
-                        void *buf = *static_cast<char**>(m_vars[i].into) -
-                                    m_vars[i].mic_offset -
-                                    (m_vars[i].flags.is_stack_buf?
-                                     0 : m_vars[i].offset);
-
-                        if (buf == NULL) {
-                            break;
-                        }
-                        // decrement buffer reference count
-                        OFFLOAD_TIMER_START(
-                            c_offload_target_release_buffer_refs);
-                        BufReleaseRef(buf);
-                        OFFLOAD_TIMER_STOP(
-                            c_offload_target_release_buffer_refs);
-                    }
-                    break;
-
-                case c_func_ptr:
-                    break;
-
-                case c_dv_data:
-                case c_dv_ptr_data:
-                case c_dv_data_slice:
-                case c_dv_ptr_data_slice:
-                    if (m_vars[i].free_if &&
-                        m_vars[i].direction.in &&
-                        !m_vars[i].flags.is_static_dstn) {
-                        ArrDesc *dvp =
-                            (m_vars[i].type.dst == c_dv_data_slice ||
-                             m_vars[i].type.dst == c_dv_data) ?
-                            static_cast<ArrDesc*>(m_vars[i].into) :
-                            *static_cast<ArrDesc**>(m_vars[i].into);
-                        void *buf = reinterpret_cast<char*>(dvp->Base) -
-                              m_vars[i].mic_offset -
-                              m_vars[i].offset;
-
-                        if (buf == NULL) {
-                            break;
-                        }
-                        // decrement buffer reference count
-                        OFFLOAD_TIMER_START(
-                            c_offload_target_release_buffer_refs);
-                        BufReleaseRef(buf);
-                        OFFLOAD_TIMER_STOP(
-                            c_offload_target_release_buffer_refs);
-                    }
-                    break;
-
-                default:
-                    LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
-                    abort();
-            }
-        }
-    }
-
-    OFFLOAD_DEBUG_TRACE(2, "OUT buffer @ p %p size %lld\n",
-                        m_out.get_buffer_start(),
-                        m_out.get_buffer_size());
-
-    OFFLOAD_DEBUG_DUMP_BYTES(2,
-                             m_out.get_buffer_start(),
-                             m_out.get_buffer_size());
-
-    OFFLOAD_DEBUG_TRACE_1(1, get_offload_number(), c_offload_copyout_data,
-                  "Total copyout data sent to host: [%lld] bytes\n",
-                  m_out.get_tfr_size());
-
-    OFFLOAD_TIMER_STOP(c_offload_target_gather_outputs);
-}
-
-void __offload_target_init(void)
-{
-#ifdef SEP_SUPPORT
-    const char* env_var = getenv(sep_monitor_env);
-    if (env_var != 0 && *env_var != '\0') {
-        sep_monitor = atoi(env_var);
-    }
-    env_var = getenv(sep_device_env);
-    if (env_var != 0 && *env_var != '\0') {
-        sep_device = env_var;
-    }
-#endif // SEP_SUPPORT
-
-    prefix = report_get_message_str(c_report_mic);
-
-    // init frequency
-    mic_frequency = COIPerfGetCycleFrequency();
-}
-
-// User-visible offload API
-
-int _Offload_number_of_devices(void)
-{
-    return mic_engines_total;
-}
-
-int _Offload_get_device_number(void)
-{
-    return mic_index;
-}
-
-int _Offload_get_physical_device_number(void)
-{
-    uint32_t index;
-    EngineGetIndex(&index);
-    return index;
-}
diff --git a/offload/src/offload_target.h b/offload/src/offload_target.h
deleted file mode 100644
index 7db3147..0000000
--- a/offload/src/offload_target.h
+++ /dev/null
@@ -1,100 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// The parts of the offload library used only on the target
-
-#ifndef OFFLOAD_TARGET_H_INCLUDED
-#define OFFLOAD_TARGET_H_INCLUDED
-
-#include "offload_common.h"
-#include "coi/coi_server.h"
-
-// The offload descriptor.
-class OffloadDescriptor
-{
-public:
-    ~OffloadDescriptor() {
-        if (m_vars != 0) {
-            free(m_vars);
-        }
-    }
-
-    // Entry point for COI. Synchronously execute offloaded region given
-    // the provided buffers, misc and return data.
-    static void offload(
-        uint32_t  buffer_count,
-        void**    buffers,
-        void*     misc_data,
-        uint16_t  misc_data_len,
-        void*     return_data,
-        uint16_t  return_data_len
-    );
-
-    // scatters input data from in buffer to target variables
-    void scatter_copyin_data();
-
-    // gathers output data to the buffer
-    void gather_copyout_data();
-
-    // merges local variable descriptors with the descriptors received from
-    // host
-    void merge_var_descs(VarDesc *vars, VarDesc2 *vars2, int vars_total);
-
-    int get_offload_number() const {
-        return m_offload_number;
-    }
-
-    void set_offload_number(int number) {
-        m_offload_number = number;
-    }
-
-private:
-    // Constructor
-    OffloadDescriptor() : m_vars(0)
-    {}
-
-private:
-    typedef std::list<void*> BufferList;
-
-    // The Marshaller for the inputs of the offloaded region.
-    Marshaller m_in;
-
-    // The Marshaller for the outputs of the offloaded region.
-    Marshaller m_out;
-
-    // List of buffers that are passed to dispatch call
-    BufferList m_buffers;
-
-    // Variable descriptors received from host
-    VarDesc* m_vars;
-    int      m_vars_total;
-    int      m_offload_number;
-};
-
-// one time target initialization in main
-extern void __offload_target_init(void);
-
-// logical device index
-extern int mic_index;
-
-// total number of available logical devices
-extern int mic_engines_total;
-
-// device frequency (from COI)
-extern uint64_t mic_frequency;
-
-struct RefInfo {
-    RefInfo(bool is_add, long amount):is_added(is_add),count(amount)
-    {}
-    bool is_added;
-    long count;
-};
-
-#endif // OFFLOAD_TARGET_H_INCLUDED
diff --git a/offload/src/offload_target_main.cpp b/offload/src/offload_target_main.cpp
deleted file mode 100644
index a4921d2..0000000
--- a/offload/src/offload_target_main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-extern "C" void __offload_target_main(void);
-
-int main(int argc, char ** argv)
-{
-    __offload_target_main();
-    return 0;
-}
diff --git a/offload/src/offload_timer.h b/offload/src/offload_timer.h
deleted file mode 100644
index 1401a9d..0000000
--- a/offload/src/offload_timer.h
+++ /dev/null
@@ -1,172 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_TIMER_H_INCLUDED
-#define OFFLOAD_TIMER_H_INCLUDED
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdint.h>
-#include "liboffload_error_codes.h"
-
-extern int timer_enabled;
-
-#ifdef TIMING_SUPPORT
-
-struct OffloadTargetTimerData {
-    uint64_t frequency;
-    struct {
-        uint64_t start;
-        uint64_t total;
-    } phases[c_offload_target_max_phase];
-};
-
-struct OffloadHostTimerData {
-    // source file name and line number
-    const char* file;
-    int         line;
-
-    // host timer data
-    struct {
-        uint64_t start;
-        uint64_t total;
-    } phases[c_offload_host_max_phase];
-
-    uint64_t sent_bytes;
-    uint64_t received_bytes;
-    int card_number;
-    int offload_number;
-
-    // target timer data
-    OffloadTargetTimerData target;
-
-    // next element
-    OffloadHostTimerData *next;
-};
-
-#if HOST_LIBRARY
-
-extern int offload_report_level;
-extern int offload_report_enabled;
-#define OFFLOAD_REPORT_1 1
-#define OFFLOAD_REPORT_2 2
-#define OFFLOAD_REPORT_3 3
-#define OFFLOAD_REPORT_ON 1
-#define OFFLOAD_REPORT_OFF 0
-
-#define OFFLOAD_TIMER_DATALEN() \
-    ((timer_enabled || (offload_report_level && offload_report_enabled)) ? \
-     ((1 + c_offload_target_max_phase) * sizeof(uint64_t)) : 0)
-
-#define OFFLOAD_TIMER_START(timer_data, pnode) \
-    if (timer_enabled || \
-        (offload_report_level && offload_report_enabled)) { \
-        offload_timer_start(timer_data, pnode); \
-    }
-
-#define OFFLOAD_TIMER_STOP(timer_data, pnode) \
-    if (timer_enabled || \
-        (offload_report_level && offload_report_enabled)) { \
-        offload_timer_stop(timer_data, pnode); \
-    }
-
-#define OFFLOAD_TIMER_INIT(file, line) \
-    offload_timer_init(file, line);
-
-#define OFFLOAD_TIMER_TARGET_DATA(timer_data, data) \
-    if (timer_enabled || \
-        (offload_report_level && offload_report_enabled)) { \
-        offload_timer_fill_target_data(timer_data, data); \
-    }
-
-#define OFFLOAD_TIMER_HOST_SDATA(timer_data, data) \
-    if (offload_report_level && offload_report_enabled) { \
-        offload_timer_fill_host_sdata(timer_data, data); \
-    }
-
-#define OFFLOAD_TIMER_HOST_RDATA(timer_data, data) \
-    if (offload_report_level && offload_report_enabled) { \
-        offload_timer_fill_host_rdata(timer_data, data); \
-    }
-
-#define OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, data) \
-    if (offload_report_level && offload_report_enabled) { \
-        offload_timer_fill_host_mic_num(timer_data, data); \
-    }
-
-extern void offload_timer_start(OffloadHostTimerData *,
-                                OffloadHostPhase t_node);
-extern void offload_timer_stop(OffloadHostTimerData *,
-                               OffloadHostPhase t_node);
-extern OffloadHostTimerData * offload_timer_init(const char *file, int line);
-extern void offload_timer_fill_target_data(OffloadHostTimerData *,
-                                           void *data);
-extern void offload_timer_fill_host_sdata(OffloadHostTimerData *,
-                                          uint64_t sent_bytes);
-extern void offload_timer_fill_host_rdata(OffloadHostTimerData *,
-                                          uint64_t sent_bytes);
-extern void offload_timer_fill_host_mic_num(OffloadHostTimerData *,
-                                            int card_number);
-
-// Utility structure for starting/stopping timer
-struct OffloadTimer {
-    OffloadTimer(OffloadHostTimerData *data, OffloadHostPhase phase) :
-        m_data(data),
-        m_phase(phase)
-    {
-        OFFLOAD_TIMER_START(m_data, m_phase);
-    }
-
-    ~OffloadTimer()
-    {
-        OFFLOAD_TIMER_STOP(m_data, m_phase);
-    }
-
-private:
-    OffloadHostTimerData*   m_data;
-    OffloadHostPhase        m_phase;
-};
-
-#else
-
-#define OFFLOAD_TIMER_DATALEN() \
-    ((timer_enabled) ? \
-     ((1 + c_offload_target_max_phase) * sizeof(uint64_t)) : 0)
-
-#define OFFLOAD_TIMER_START(pnode) \
-    if (timer_enabled) offload_timer_start(pnode);
-
-#define OFFLOAD_TIMER_STOP(pnode) \
-    if (timer_enabled) offload_timer_stop(pnode);
-
-#define OFFLOAD_TIMER_INIT() \
-    if (timer_enabled) offload_timer_init();
-
-#define OFFLOAD_TIMER_TARGET_DATA(data) \
-    if (timer_enabled) offload_timer_fill_target_data(data);
-
-extern void offload_timer_start(OffloadTargetPhase t_node);
-extern void offload_timer_stop(OffloadTargetPhase t_node);
-extern void offload_timer_init(void);
-extern void offload_timer_fill_target_data(void *data);
-
-#endif // HOST_LIBRARY
-
-#else // TIMING_SUPPORT
-
-#define OFFLOAD_TIMER_START(...)
-#define OFFLOAD_TIMER_STOP(...)
-#define OFFLOAD_TIMER_INIT(...)
-#define OFFLOAD_TIMER_TARGET_DATA(...)
-#define OFFLOAD_TIMER_DATALEN(...)      (0)
-
-#endif // TIMING_SUPPORT
-
-#endif // OFFLOAD_TIMER_H_INCLUDED
diff --git a/offload/src/offload_timer_host.cpp b/offload/src/offload_timer_host.cpp
deleted file mode 100644
index fb27db0..0000000
--- a/offload/src/offload_timer_host.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_timer.h"
-
-#ifdef __INTEL_COMPILER
-#include <ia32intrin.h>
-#else // __INTEL_COMPILER
-#include <x86intrin.h>
-#endif // __INTEL_COMPILER
-
-#include "offload_host.h"
-#include <sstream>
-#include <iostream>
-#include <iomanip>
-
-int timer_enabled = 0;
-
-#ifdef TIMING_SUPPORT
-
-int offload_report_level = 0;
-int offload_report_enabled = 1;
-
-static const int host_timer_prefix_spaces[] = {
-    /*c_offload_host_setup_buffers*/         0,
-    /*c_offload_host_initialize*/            2,
-    /*c_offload_host_target_acquire*/        2,
-    /*c_offload_host_wait_deps*/             2,
-    /*c_offload_host_setup_buffers*/         2,
-    /*c_offload_host_alloc_buffers*/         4,
-    /*c_offload_host_setup_misc_data*/       2,
-    /*c_offload_host_alloc_data_buffer*/     4,
-    /*c_offload_host_send_pointers*/         2,
-    /*c_offload_host_gather_inputs*/         2,
-    /*c_offload_host_map_in_data_buffer*/    4,
-    /*c_offload_host_unmap_in_data_buffer*/  4,
-    /*c_offload_host_start_compute*/         2,
-    /*c_offload_host_wait_compute*/          2,
-    /*c_offload_host_start_buffers_reads*/   2,
-    /*c_offload_host_scatter_outputs*/       2,
-    /*c_offload_host_map_out_data_buffer*/   4,
-    /*c_offload_host_unmap_out_data_buffer*/ 4,
-    /*c_offload_host_wait_buffers_reads*/    2,
-    /*c_offload_host_destroy_buffers*/       2
-};
-
-const static int target_timer_prefix_spaces[] = {
-/*c_offload_target_total_time*/          0,
-/*c_offload_target_descriptor_setup*/    2,
-/*c_offload_target_func_lookup*/         2,
-/*c_offload_target_func_time*/           2,
-/*c_offload_target_scatter_inputs*/      4,
-/*c_offload_target_add_buffer_refs*/     6,
-/*c_offload_target_compute*/             4,
-/*c_offload_target_gather_outputs*/      4,
-/*c_offload_target_release_buffer_refs*/ 6
-};
-
-static OffloadHostTimerData* timer_data_head;
-static OffloadHostTimerData* timer_data_tail;
-static mutex_t               timer_data_mutex;
-
-static void offload_host_phase_name(std::stringstream &ss, int p_node);
-static void offload_target_phase_name(std::stringstream &ss, int p_node);
-
-extern void Offload_Timer_Print(void)
-{
-    std::string       buf;
-    std::stringstream ss;
-    const char *stars =
-        "**************************************************************";
-
-    ss << "\n\n" << stars << "\n";
-    ss << "                             ";
-    ss << report_get_message_str(c_report_title) << "\n";
-    ss << stars << "\n";
-    double frequency = cpu_frequency;
-
-    for (OffloadHostTimerData *pnode = timer_data_head;
-         pnode != 0; pnode = pnode->next) {
-        ss << "      ";
-        ss << report_get_message_str(c_report_from_file) << " "<< pnode->file;
-        ss << report_get_message_str(c_report_line) << " " << pnode->line;
-        ss << "\n";
-        for (int i = 0; i < c_offload_host_max_phase ; i++) {
-            ss << "          ";
-            offload_host_phase_name(ss, i);
-            ss << "   " << std::fixed << std::setprecision(5);
-            ss << (double)pnode->phases[i].total / frequency << "\n";
-        }
-
-        for (int i = 0; i < c_offload_target_max_phase ; i++) {
-            double time = 0;
-            if (pnode->target.frequency != 0) {
-                time = (double) pnode->target.phases[i].total /
-                       (double) pnode->target.frequency;
-            }
-            ss << "          ";
-            offload_target_phase_name(ss, i);
-            ss << "   " << std::fixed << std::setprecision(5);
-            ss << time << "\n";
-        }
-    }
-
-    buf = ss.str();
-    fprintf(stdout, buf.data());
-    fflush(stdout);
-}
-
-extern void Offload_Report_Prolog(OffloadHostTimerData *pnode)
-{
-    double frequency = cpu_frequency;
-    std::string       buf;
-    std::stringstream ss;
-
-    if (pnode) {
-        // [Offload] [Mic 0] [File]          file.c
-        ss << "[" << report_get_message_str(c_report_offload) << "] [";
-        ss << report_get_message_str(c_report_mic) << " ";
-        ss << pnode->card_number << "] [";
-        ss << report_get_message_str(c_report_file);
-        ss << "]                    " << pnode->file << "\n";
-
-        // [Offload] [Mic 0] [Line]          1234
-        ss << "[" << report_get_message_str(c_report_offload) << "] [";
-        ss << report_get_message_str(c_report_mic) << " ";
-        ss << pnode->card_number << "] [";
-        ss << report_get_message_str(c_report_line);
-        ss << "]                    " << pnode->line << "\n";
-
-        // [Offload] [Mic 0] [Tag]          Tag 1
-        ss << "[" << report_get_message_str(c_report_offload) << "] [";
-        ss << report_get_message_str(c_report_mic) << " ";
-        ss << pnode->card_number << "] [";
-        ss << report_get_message_str(c_report_tag);
-        ss << "]                     " << report_get_message_str(c_report_tag);
-        ss << " " << pnode->offload_number << "\n";
-
-        buf = ss.str();
-        fprintf(stdout, buf.data());
-        fflush(stdout);
-    }
-}
-
-extern void Offload_Report_Epilog(OffloadHostTimerData * timer_data)
-{
-    double frequency = cpu_frequency;
-    std::string       buf;
-    std::stringstream ss;
-
-    OffloadHostTimerData *pnode = timer_data;
-
-    if (!pnode) {
-        return;
-    }
-    ss << "[" << report_get_message_str(c_report_offload) << "] [";
-    ss << report_get_message_str(c_report_host) << "]  [";
-    ss << report_get_message_str(c_report_tag) <<  " ";
-    ss << pnode->offload_number << "] [";
-    ss << report_get_message_str(c_report_cpu_time) << "]        ";
-    ss << std::fixed << std::setprecision(6);
-    ss << (double) pnode->phases[0].total / frequency;
-    ss << report_get_message_str(c_report_seconds) << "\n";
-
-    if (offload_report_level >= OFFLOAD_REPORT_2) {
-        ss << "[" << report_get_message_str(c_report_offload) << "] [";
-        ss << report_get_message_str(c_report_mic);
-        ss << " " << pnode->card_number;
-        ss << "] [" << report_get_message_str(c_report_tag) << " ";
-        ss <<  pnode->offload_number << "] [";
-        ss << report_get_message_str(c_report_cpu_to_mic_data) << "]   ";
-        ss << pnode->sent_bytes << " ";
-        ss << report_get_message_str(c_report_bytes) << "\n";
-    }
-
-    double time = 0;
-    if (pnode->target.frequency != 0) {
-        time = (double) pnode->target.phases[0].total /
-            (double) pnode->target.frequency;
-    }
-    ss << "[" << report_get_message_str(c_report_offload) << "] [";
-    ss << report_get_message_str(c_report_mic) << " ";
-    ss << pnode->card_number<< "] [";
-    ss << report_get_message_str(c_report_tag) <<  " ";
-    ss << pnode->offload_number << "] [";
-    ss << report_get_message_str(c_report_mic_time) << "]        ";
-    ss << std::fixed << std::setprecision(6) << time;
-    ss << report_get_message_str(c_report_seconds) << "\n";
-
-    if (offload_report_level >= OFFLOAD_REPORT_2) {
-        ss << "[" << report_get_message_str(c_report_offload) << "] [";
-        ss << report_get_message_str(c_report_mic);
-        ss << " " << pnode->card_number;
-        ss << "] [" << report_get_message_str(c_report_tag) << " ";
-        ss <<  pnode->offload_number << "] [";
-        ss << report_get_message_str(c_report_mic_to_cpu_data) << "]   ";
-        ss << pnode->received_bytes << " ";
-        ss << report_get_message_str(c_report_bytes) << "\n";
-    }
-    ss << "\n";
-
-    buf = ss.str();
-    fprintf(stdout, buf.data());
-    fflush(stdout);
-
-    offload_report_free_data(timer_data);
-}
-
-extern void offload_report_free_data(OffloadHostTimerData * timer_data)
-{
-    OffloadHostTimerData *pnode_last = NULL;
-
-    for (OffloadHostTimerData *pnode = timer_data_head;
-         pnode != 0; pnode = pnode->next) {
-        if (timer_data == pnode) {
-            if (pnode_last) {
-                pnode_last->next = pnode->next;
-            }
-            else {
-                timer_data_head = pnode->next;
-            }
-            OFFLOAD_FREE(pnode);
-            break;
-        }
-        pnode_last = pnode;
-    }
-}
-
-static void fill_buf_with_spaces(std::stringstream &ss, int num)
-{
-    for (; num > 0; num--) {
-        ss << " ";
-    }
-}
-
-static void offload_host_phase_name(std::stringstream &ss, int p_node)
-{
-    int prefix_spaces;
-    int str_length;
-    int tail_length;
-    const int message_length = 40;
-    char const *str;
-
-    str = report_get_host_stage_str(p_node);
-    prefix_spaces = host_timer_prefix_spaces[p_node];
-    fill_buf_with_spaces(ss, prefix_spaces);
-    str_length = strlen(str);
-    ss << str;
-    tail_length = message_length - prefix_spaces - str_length;
-    tail_length = tail_length > 0? tail_length : 1;
-    fill_buf_with_spaces(ss, tail_length);
-}
-
-static void offload_target_phase_name(std::stringstream &ss, int p_node)
-{
-    int prefix_spaces;
-    int str_length;
-    const int message_length = 40;
-    int tail_length;
-    char const *str;
-
-    str = report_get_target_stage_str(p_node);
-    prefix_spaces = target_timer_prefix_spaces[p_node];
-    fill_buf_with_spaces(ss, prefix_spaces);
-    str_length = strlen(str);
-    ss << str;
-    tail_length = message_length - prefix_spaces - str_length;
-    tail_length = (tail_length > 0)? tail_length : 1;
-    fill_buf_with_spaces(ss, tail_length);
-}
-
-void offload_timer_start(OffloadHostTimerData * timer_data,
-                         OffloadHostPhase p_type)
-{
-    timer_data->phases[p_type].start = _rdtsc();
-}
-
-void offload_timer_stop(OffloadHostTimerData * timer_data,
-                        OffloadHostPhase p_type)
-{
-    timer_data->phases[p_type].total += _rdtsc() -
-                                        timer_data->phases[p_type].start;
-}
-
-void offload_timer_fill_target_data(OffloadHostTimerData * timer_data,
-                                    void *buf)
-{
-    uint64_t *data = (uint64_t*) buf;
-
-    timer_data->target.frequency = *data++;
-    for (int i = 0; i < c_offload_target_max_phase; i++) {
-        timer_data->target.phases[i].total = *data++;
-    }
-}
-
-void offload_timer_fill_host_sdata(OffloadHostTimerData * timer_data,
-                                   uint64_t sent_bytes)
-{
-    if (timer_data) {
-        timer_data->sent_bytes += sent_bytes;
-    }
-}
-
-void offload_timer_fill_host_rdata(OffloadHostTimerData * timer_data,
-                                   uint64_t received_bytes)
-{
-    if (timer_data) {
-        timer_data->received_bytes += received_bytes;
-    }
-}
-
-void offload_timer_fill_host_mic_num(OffloadHostTimerData * timer_data,
-                                     int card_number)
-{
-    if (timer_data) {
-        timer_data->card_number = card_number;
-    }
-}
-
-OffloadHostTimerData* offload_timer_init(const char *file, int line)
-{
-    static bool first_time = true;
-    OffloadHostTimerData* timer_data = NULL;
-
-    timer_data_mutex.lock();
-    {
-        if (timer_enabled ||
-            (offload_report_level && offload_report_enabled)) {
-            timer_data = (OffloadHostTimerData*)
-                OFFLOAD_MALLOC(sizeof(OffloadHostTimerData), 0);
-            memset(timer_data, 0, sizeof(OffloadHostTimerData));
-
-            timer_data->offload_number = OFFLOAD_DEBUG_INCR_OFLD_NUM() - 1;
-
-            if (timer_data_head == 0) {
-                timer_data_head = timer_data;
-                timer_data_tail = timer_data;
-            }
-            else {
-                timer_data_tail->next = timer_data;
-                timer_data_tail = timer_data;
-            }
-
-            timer_data->file = file;
-            timer_data->line = line;
-        }
-    }
-    timer_data_mutex.unlock();
-    return timer_data;
-}
-
-#endif // TIMING_SUPPORT
diff --git a/offload/src/offload_timer_target.cpp b/offload/src/offload_timer_target.cpp
deleted file mode 100644
index 30a4c91..0000000
--- a/offload/src/offload_timer_target.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_timer.h"
-#include "offload_target.h"
-
-#ifdef __INTEL_COMPILER
-#include <ia32intrin.h>
-#else // __INTEL_COMPILER
-#include <x86intrin.h>
-#endif // __INTEL_COMPILER
-
-
-
-int timer_enabled = 0;
-
-#ifdef TIMING_SUPPORT
-
-#if defined(LINUX) || defined(FREEBSD)
-static __thread OffloadTargetTimerData timer_data;
-#else // WINNT
-static __declspec(thread) OffloadTargetTimerData timer_data;
-#endif // defined(LINUX) || defined(FREEBSD)
-
-
-void offload_timer_start(
-    OffloadTargetPhase p_type
-)
-{
-    timer_data.phases[p_type].start = _rdtsc();
-}
-
-void offload_timer_stop(
-    OffloadTargetPhase p_type
-)
-{
-    timer_data.phases[p_type].total += _rdtsc() -
-                                       timer_data.phases[p_type].start;
-}
-
-void offload_timer_init()
-{
-    memset(&timer_data, 0, sizeof(OffloadTargetTimerData));
-}
-
-void offload_timer_fill_target_data(
-    void *buf
-)
-{
-    uint64_t *data = (uint64_t*) buf;
-
-    timer_data.frequency = mic_frequency;
-    memcpy(data++, &(timer_data.frequency), sizeof(uint64_t));
-
-    for (int i = 0; i < c_offload_target_max_phase; i++) {
-        memcpy(data++, &(timer_data.phases[i].total), sizeof(uint64_t));
-    }
-}
-
-#endif // TIMING_SUPPORT
diff --git a/offload/src/offload_trace.cpp b/offload/src/offload_trace.cpp
deleted file mode 100644
index 0a06204..0000000
--- a/offload/src/offload_trace.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_trace.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <sstream>
-#include "liboffload_error_codes.h"
-
-extern const char *prefix;
-
-#if !HOST_LIBRARY
-extern int mic_index;
-#endif
-
-// The debug routines
-
-static const char * offload_stage(std::stringstream &ss,
-                                  int offload_number,
-                                  const char *tag,
-                                  const char *text,
-                                  bool print_tag)
-{
-    ss << "[" << report_get_message_str(c_report_offload) << "]";
-#if HOST_LIBRARY
-    ss << " [" << prefix << "]";
-    if (print_tag) {
-        ss << "  [" << report_get_message_str(c_report_tag);
-        ss << " " << offload_number << "]";
-    }
-    else {
-        ss << "         ";
-    }
-    ss << " [" << tag << "]";
-    ss << "           " << text;
-#else
-    ss << " [" << prefix << " " << mic_index << "]";
-    if (print_tag) {
-        ss << " [" << report_get_message_str(c_report_tag);
-        ss << " " << offload_number << "]";
-    }
-    ss << " [" << tag << "]";
-    ss << "           " << text;
-#endif
-    return 0;
-}
-
-static const char * offload_signal(std::stringstream &ss,
-                                  int offload_number,
-                                  const char *tag,
-                                  const char *text)
-{
-    ss << "[" << report_get_message_str(c_report_offload) << "]";
-    ss << " [" << prefix << "]";
-    ss << "  [" << report_get_message_str(c_report_tag);
-    ss << " " << offload_number << "]";
-    ss << " [" << tag << "]";
-    ss << "          " << text;
-    return 0;
-}
-
-void offload_stage_print(int stage, int offload_number, ...)
-{
-    std::string buf;
-    std::stringstream ss;
-    char const *str1;
-    char const *str2;
-    va_list va_args;
-    va_start(va_args, offload_number);
-    va_arg(va_args, char*);
-
-    switch (stage) {
-        case c_offload_start:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_start);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_init:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_init);
-            offload_stage(ss, offload_number, str1, str2, false);
-            ss << " " << report_get_message_str(c_report_logical_card);
-            ss << " " << va_arg(va_args, int);
-            ss << " = " << report_get_message_str(c_report_physical_card);
-            ss << " " << va_arg(va_args, int);
-            break;
-        case c_offload_register:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_register);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_init_func:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_init_func);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << ": " << va_arg(va_args, char*);
-            break;
-        case c_offload_create_buf_host:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_create_buf_host);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << ": base=0x" << std::hex << va_arg(va_args, uint64_t);
-            ss << " length=" << std::dec << va_arg(va_args, uint64_t);
-            break;
-        case c_offload_create_buf_mic:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_create_buf_mic);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << ": size=" << va_arg(va_args, uint64_t);
-            ss << " offset=" << va_arg(va_args, int);
-            if (va_arg(va_args,int))
-               ss << " (2M page)";
-            break;
-        case c_offload_send_pointer_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_send_pointer_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_sent_pointer_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_sent_pointer_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << " " << va_arg(va_args, uint64_t);
-            break;
-        case c_offload_gather_copyin_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_gather_copyin_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_copyin_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_copyin_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << " " << va_arg(va_args, uint64_t) << " ";
-            break;
-        case c_offload_compute:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_compute);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_receive_pointer_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_receive_pointer_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_received_pointer_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_received_pointer_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << " " << va_arg(va_args, uint64_t);
-            break;
-        case c_offload_start_target_func:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_start_target_func);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << ": " << va_arg(va_args, char*);
-            break;
-        case c_offload_var:
-            str1 = report_get_message_str(c_report_var);
-            offload_stage(ss, offload_number, str1, "  ", true);
-            va_arg(va_args, int);
-            ss << va_arg(va_args, char*);
-            ss << " " << " " << va_arg(va_args, char*);
-            break;
-        case c_offload_scatter_copyin_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_scatter_copyin_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_gather_copyout_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_gather_copyout_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_scatter_copyout_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_scatter_copyout_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_copyout_data:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_copyout_data);
-            offload_stage(ss, offload_number, str1, str2, true);
-            ss << "   " << va_arg(va_args, uint64_t);
-            break;
-        case c_offload_signal:
-            {
-                uint64_t  *signal;
-                str1 = report_get_message_str(c_report_state_signal);
-                str2 = report_get_message_str(c_report_signal);
-                offload_signal(ss, offload_number, str1, str2);
-	        signal = va_arg(va_args, uint64_t*);
-	        if (signal)
-                   ss << " 0x" << std::hex << *signal;
-                else
-                   ss << " none";
-            }
-            break;
-        case c_offload_wait:
-            {
-                int count;
-                uint64_t  **signal;
-                str1 = report_get_message_str(c_report_state_signal);
-                str2 = report_get_message_str(c_report_wait);
-                offload_signal(ss, offload_number, str1, str2);
-                count = va_arg(va_args, int);
-                signal = va_arg(va_args, uint64_t**);
-                if (count) {
-                    while (count) {
-                        ss << " " << std::hex << signal[count-1];
-                        count--;
-                    }
-                }
-                else
-                    ss << " none";
-            }
-            break;
-        case c_offload_unregister:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_unregister);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_destroy:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_destroy);
-            offload_stage(ss, offload_number, str1, str2, true);
-            break;
-        case c_offload_myoinit:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myoinit);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_myoregister:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myoregister);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_myofini:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myofini);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_mic_myo_shared:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_mic_myo_shared);
-            offload_stage(ss, offload_number, str1, str2, false);
-            ss << " " << va_arg(va_args, char*);
-            break;
-        case c_offload_mic_myo_fptr:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_mic_myo_fptr);
-            offload_stage(ss, offload_number, str1, str2, false);
-            ss << " " << va_arg(va_args, char*);
-            break;
-        case c_offload_myosharedmalloc:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myosharedmalloc);
-            offload_stage(ss, offload_number, str1, str2, false);
-            va_arg(va_args, char*);
-            ss << " " << va_arg(va_args, size_t);
-            break;
-        case c_offload_myosharedfree:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myosharedfree);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_myosharedalignedmalloc:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myosharedalignedmalloc);
-            offload_stage(ss, offload_number, str1, str2, false);
-            va_arg(va_args, char*);
-            ss << " " << va_arg(va_args, size_t);
-            ss << " " << va_arg(va_args, size_t);
-            break;
-        case c_offload_myosharedalignedfree:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myosharedalignedfree);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_myoacquire:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myoacquire);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        case c_offload_myorelease:
-            str1 = report_get_message_str(c_report_state);
-            str2 = report_get_message_str(c_report_myorelease);
-            offload_stage(ss, offload_number, str1, str2, false);
-            break;
-        default:
-            LIBOFFLOAD_ERROR(c_report_unknown_trace_node);
-            abort();
-    }
-    ss << "\n";
-    buf = ss.str();
-    fprintf(stdout, buf.data());
-    fflush(stdout);
-
-    va_end(va_args);
-    return;
-}
diff --git a/offload/src/offload_trace.h b/offload/src/offload_trace.h
deleted file mode 100644
index 65c28a4..0000000
--- a/offload/src/offload_trace.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// The parts of the offload library common to host and target
-
-void offload_stage_print(int stage, int offload_number, ...);
-
-enum OffloadTraceStage {
-    // Total time spent on the target
-    c_offload_start = 0,
-    c_offload_init,
-    c_offload_register,
-    c_offload_init_func,
-    c_offload_create_buf_host,
-    c_offload_create_buf_mic,
-    c_offload_send_pointer_data,
-    c_offload_sent_pointer_data,
-    c_offload_gather_copyin_data,
-    c_offload_copyin_data,
-    c_offload_compute,
-    c_offload_receive_pointer_data,
-    c_offload_received_pointer_data,
-    c_offload_start_target_func,
-    c_offload_var,
-    c_offload_scatter_copyin_data,
-    c_offload_gather_copyout_data,
-    c_offload_scatter_copyout_data,
-    c_offload_copyout_data,
-    c_offload_signal,
-    c_offload_wait,
-    c_offload_unregister,
-    c_offload_destroy,
-    c_offload_finish,
-    c_offload_myoinit,
-    c_offload_myoregister,
-    c_offload_mic_myo_shared,
-    c_offload_mic_myo_fptr,
-    c_offload_myosharedmalloc,
-    c_offload_myosharedfree,
-    c_offload_myosharedalignedmalloc,
-    c_offload_myosharedalignedfree,
-    c_offload_myoacquire,
-    c_offload_myorelease,
-    c_offload_myofini
-};
diff --git a/offload/src/offload_util.cpp b/offload/src/offload_util.cpp
deleted file mode 100644
index 68462c8..0000000
--- a/offload/src/offload_util.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "offload_util.h"
-#include <errno.h>
-#include "liboffload_error_codes.h"
-
-#ifdef TARGET_WINNT
-void *thread_getspecific(pthread_key_t key)
-{
-    if (key == 0) {
-        return NULL;
-    }
-    else {
-        return TlsGetValue(key);
-    }
-}
-
-int thread_setspecific(pthread_key_t key, const void *value)
-{
-    return (TlsSetValue(key, (LPVOID)value)) ? 0 : GetLastError();
-}
-#endif // TARGET_WINNT
-
-bool __offload_parse_size_string(const char *str, uint64_t &new_size)
-{
-    uint64_t val;
-    char *suffix;
-
-    errno = 0;
-#ifdef TARGET_WINNT
-    val = strtoul(str, &suffix, 10);
-#else // TARGET_WINNT
-    val = strtoull(str, &suffix, 10);
-#endif // TARGET_WINNT
-    if (errno != 0 || suffix == str) {
-        return false;
-    }
-
-    if (suffix[0] == '\0') {
-        // default is Kilobytes
-        new_size = val * 1024;
-        return true;
-    }
-    else if (suffix[1] == '\0') {
-        // Optional suffixes: B (bytes), K (Kilobytes), M (Megabytes),
-        // G (Gigabytes), or T (Terabytes) specify the units.
-        switch (suffix[0]) {
-            case 'b':
-            case 'B':
-                new_size = val;
-                break;
-
-            case 'k':
-            case 'K':
-                new_size = val * 1024;
-                break;
-
-            case 'm':
-            case 'M':
-                new_size = val * 1024 * 1024;
-                break;
-
-            case 'g':
-            case 'G':
-                new_size = val * 1024 * 1024 * 1024;
-                break;
-
-            case 't':
-            case 'T':
-                new_size = val * 1024 * 1024 * 1024 * 1024;
-                break;
-
-            default:
-                return false;
-        }
-        return true;
-    }
-
-    return false;
-}
-
-bool __offload_parse_int_string(const char *str, int64_t &value)
-{
-    int64_t val;
-    char *suffix;
-
-    errno = 0;
-#ifdef TARGET_WINNT
-    val = strtol(str, &suffix, 0);
-#else
-    val = strtoll(str, &suffix, 0);
-#endif
-    if (errno == 0 && suffix != str && *suffix == '\0') {
-        value = val;
-        return true;
-    }
-    return false;
-}
-
-#ifdef TARGET_WINNT
-extern void* DL_open(const char *path)
-{
-    void *handle;
-    int error_mode;
-
-    /*
-     * do not display message box with error if it the call below fails to
-     * load dynamic library.
-     */
-    error_mode = SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX);
-
-    /* load dynamic library */
-    handle = (void*) LoadLibrary(path);
-
-    /* restore error mode */
-    SetErrorMode(error_mode);
-
-    return handle;
-}
-
-extern int DL_addr(const void *addr, Dl_info *dl_info)
-{
-    MEMORY_BASIC_INFORMATION mem_info;
-    char mod_name[MAX_PATH];
-    HMODULE mod_handle;
-
-    /* Fill MEMORY_BASIC_INFORMATION struct */
-    if (!VirtualQuery(addr, &mem_info, sizeof(mem_info))) {
-        return 0;
-    }
-    mod_handle = (HMODULE)mem_info.AllocationBase;
-
-    /* ANSI file name for module */
-    if (!GetModuleFileNameA(mod_handle, (char*) mod_name, sizeof(mod_name))) {
-        return 0;
-    }
-    strcpy(dl_info->dli_fname, mod_name);
-    dl_info->dli_fbase = mem_info.BaseAddress;
-    dl_info->dli_saddr = addr;
-    strcpy(dl_info->dli_sname, mod_name);
-    return 1;
-}
-
-// Run once
-static BOOL CALLBACK __offload_run_once_wrapper(
-    PINIT_ONCE initOnce,
-    PVOID parameter,
-    PVOID *context
-)
-{
-    void (*init_routine)(void) = (void(*)(void)) parameter;
-    init_routine();
-    return true;
-}
-
-void __offload_run_once(OffloadOnceControl *ctrl, void (*func)(void))
-{
-    InitOnceExecuteOnce(ctrl, __offload_run_once_wrapper, (void*) func, 0);
-}
-#endif // TARGET_WINNT
-
-/* ARGSUSED */ // version is not used on windows
-void* DL_sym(void *handle, const char *name, const char *version)
-{
-#ifdef TARGET_WINNT
-    return GetProcAddress((HMODULE) handle, name);
-#else // TARGET_WINNT
-    if (version == 0) {
-        return dlsym(handle, name);
-    }
-    else {
-        return dlvsym(handle, name, version);
-    }
-#endif // TARGET_WINNT
-}
-
-int64_t get_el_value(
-                     char *base,
-                     int64_t offset,
-                     int64_t size)
-{
-    int64_t val = 0;
-    switch (size) {
-        case 1:
-            val = static_cast<int64_t>(*((char *)(base + offset)));
-            break;
-        case 2:
-            val = static_cast<int64_t>(*((short *)(base + offset)));
-            break;
-        case 4:
-            val = static_cast<int64_t>(*((int *)(base + offset)));
-            break;
-        default:
-            val = *((int64_t *)(base + offset));
-            break;
-    }
-    return val;
-}
diff --git a/offload/src/offload_util.h b/offload/src/offload_util.h
deleted file mode 100644
index e50d77d..0000000
--- a/offload/src/offload_util.h
+++ /dev/null
@@ -1,153 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef OFFLOAD_UTIL_H_INCLUDED
-#define OFFLOAD_UTIL_H_INCLUDED
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#ifdef TARGET_WINNT
-#include <windows.h>
-#include <process.h>
-#else // TARGET_WINNT
-#include <dlfcn.h>
-#include <pthread.h>
-#endif // TARGET_WINNT
-
-#ifdef TARGET_WINNT
-typedef unsigned pthread_key_t;
-typedef int pid_t;
-
-#define __func__ __FUNCTION__
-#define strtok_r(s,d,p) strtok_s(s,d,p)
-#define strcasecmp(a,b) stricmp(a,b)
-
-#define thread_key_create(key, destructor) \
-    (((*key = TlsAlloc()) > 0) ? 0 : GetLastError())
-#define thread_key_delete(key) TlsFree(key)
-
-#ifndef S_ISREG
-#define S_ISREG(mode)  (((mode) & S_IFMT) == S_IFREG)
-#endif
-
-void*   thread_getspecific(pthread_key_t key);
-int     thread_setspecific(pthread_key_t key, const void *value);
-#else
-#define thread_key_create(key, destructor) \
-            pthread_key_create((key), (destructor))
-#define thread_key_delete(key)  pthread_key_delete(key)
-#define thread_getspecific(key) pthread_getspecific(key)
-#define thread_setspecific(key, value) pthread_setspecific(key, value)
-#endif // TARGET_WINNT
-
-// Mutex implementation
-struct mutex_t {
-    mutex_t() {
-#ifdef TARGET_WINNT
-        InitializeCriticalSection(&m_lock);
-#else // TARGET_WINNT
-        pthread_mutex_init(&m_lock, 0);
-#endif // TARGET_WINNT
-    }
-
-    ~mutex_t() {
-#ifdef TARGET_WINNT
-        DeleteCriticalSection(&m_lock);
-#else // TARGET_WINNT
-        pthread_mutex_destroy(&m_lock);
-#endif // TARGET_WINNT
-    }
-
-    void lock() {
-#ifdef TARGET_WINNT
-        EnterCriticalSection(&m_lock);
-#else // TARGET_WINNT
-        pthread_mutex_lock(&m_lock);
-#endif // TARGET_WINNT
-    }
-
-    void unlock() {
-#ifdef TARGET_WINNT
-        LeaveCriticalSection(&m_lock);
-#else // TARGET_WINNT
-        pthread_mutex_unlock(&m_lock);
-#endif // TARGET_WINNT
-    }
-
-private:
-#ifdef TARGET_WINNT
-    CRITICAL_SECTION    m_lock;
-#else
-    pthread_mutex_t     m_lock;
-#endif
-};
-
-struct mutex_locker_t {
-    mutex_locker_t(mutex_t &mutex) : m_mutex(mutex) {
-        m_mutex.lock();
-    }
-
-    ~mutex_locker_t() {
-        m_mutex.unlock();
-    }
-
-private:
-    mutex_t &m_mutex;
-};
-
-// Dynamic loader interface
-#ifdef TARGET_WINNT
-struct Dl_info
-{
-    char        dli_fname[MAX_PATH];
-    void       *dli_fbase;
-    char        dli_sname[MAX_PATH];
-    const void *dli_saddr;
-};
-
-void*   DL_open(const char *path);
-#define DL_close(handle)        FreeLibrary((HMODULE) (handle))
-int     DL_addr(const void *addr, Dl_info *info);
-#else
-#define DL_open(path)           dlopen((path), RTLD_NOW)
-#define DL_close(handle)        dlclose(handle)
-#define DL_addr(addr, info)     dladdr((addr), (info))
-#endif // TARGET_WINNT
-
-extern void* DL_sym(void *handle, const char *name, const char *version);
-
-// One-time initialization API
-#ifdef TARGET_WINNT
-typedef INIT_ONCE                   OffloadOnceControl;
-#define OFFLOAD_ONCE_CONTROL_INIT   INIT_ONCE_STATIC_INIT
-
-extern void __offload_run_once(OffloadOnceControl *ctrl, void (*func)(void));
-#else
-typedef pthread_once_t              OffloadOnceControl;
-#define OFFLOAD_ONCE_CONTROL_INIT   PTHREAD_ONCE_INIT
-
-#define __offload_run_once(ctrl, func) pthread_once(ctrl, func)
-#endif // TARGET_WINNT
-
-// Parses size specification string.
-extern bool __offload_parse_size_string(const char *str, uint64_t &new_size);
-
-// Parses string with integer value
-extern bool __offload_parse_int_string(const char *str, int64_t &value);
-
-// get value by its base, offset and size
-int64_t get_el_value(
-    char   *base,
-    int64_t offset,
-    int64_t size
-);
-#endif // OFFLOAD_UTIL_H_INCLUDED
diff --git a/offload/src/ofldbegin.cpp b/offload/src/ofldbegin.cpp
deleted file mode 100644
index 945f982..0000000
--- a/offload/src/ofldbegin.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#if HOST_LIBRARY
-#include "offload_host.h"
-#include "offload_myo_host.h"
-#else
-#include "compiler_if_target.h"
-#include "offload_target.h"
-#include "offload_myo_target.h"
-#endif
-
-#ifdef TARGET_WINNT
-#define ALLOCATE(name) __declspec(allocate(name))
-#define DLL_LOCAL
-#else // TARGET_WINNT
-#define ALLOCATE(name) __attribute__((section(name)))
-#define DLL_LOCAL  __attribute__((visibility("hidden")))
-#endif // TARGET_WINNT
-
-#if HOST_LIBRARY
-// the host program/shared library should always have __offload_target_image
-// symbol defined. This symbol specifies the beginning of the target program
-// image.
-extern "C" DLL_LOCAL const void* __offload_target_image;
-#else // HOST_LIBRARY
-// Define a weak main which would be used on target side in case usere's
-// source file containing main does not have offload code.
-#pragma weak main
-int main(void)
-{
-    OFFLOAD_TARGET_MAIN();
-    return 0;
-}
-
-#pragma weak MAIN__
-extern "C" int MAIN__(void)
-{
-    OFFLOAD_TARGET_MAIN();
-    return 0;
-}
-#endif // HOST_LIBRARY
-
-// offload section prolog
-ALLOCATE(OFFLOAD_ENTRY_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FuncTable::Entry)))
-#endif // TARGET_WINNT
-static FuncTable::Entry __offload_entry_table_start = { 0 };
-
-// list element for the current module
-static FuncList::Node __offload_entry_node = {
-    { &__offload_entry_table_start + 1, -1 },
-    0, 0
-};
-
-// offload fp section prolog
-ALLOCATE(OFFLOAD_FUNC_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FuncTable::Entry)))
-#endif // TARGET_WINNT
-static FuncTable::Entry __offload_func_table_start = { 0 };
-
-// list element for the current module
-static FuncList::Node __offload_func_node = {
-    { &__offload_func_table_start + 1, -1 },
-    0, 0
-};
-
-// offload fp section prolog
-ALLOCATE(OFFLOAD_VAR_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(VarTable::Entry)))
-#endif // TARGET_WINNT
-static VarTable::Entry __offload_var_table_start = { 0 };
-
-// list element for the current module
-static VarList::Node __offload_var_node = {
-    { &__offload_var_table_start + 1 },
-    0, 0
-};
-
-#ifdef MYO_SUPPORT
-
-// offload myo shared var section prolog
-ALLOCATE(OFFLOAD_MYO_SHARED_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(SharedTableEntry)))
-#endif // TARGET_WINNT
-static SharedTableEntry __offload_myo_shared_table_start = { 0 };
-
-#if HOST_LIBRARY
-// offload myo shared var init section prolog
-ALLOCATE(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(InitTableEntry)))
-#endif // TARGET_WINNT
-static InitTableEntry __offload_myo_shared_init_table_start = { 0 };
-#endif
-
-// offload myo fptr section prolog
-ALLOCATE(OFFLOAD_MYO_FPTR_TABLE_SECTION_START)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FptrTableEntry)))
-#endif // TARGET_WINNT
-static FptrTableEntry __offload_myo_fptr_table_start = { 0 };
-
-#endif // MYO_SUPPORT
-
-// init/fini code which adds/removes local lookup data to/from the global list
-
-static void offload_fini();
-
-#ifndef TARGET_WINNT
-static void offload_init() __attribute__((constructor(101)));
-#else // TARGET_WINNT
-static void offload_init();
-
-// Place offload initialization before user constructors
-ALLOCATE(OFFLOAD_CRTINIT_SECTION_START)
-static void (*addressof_offload_init)() = offload_init;
-#endif // TARGET_WINNT
-
-static void offload_init()
-{
-    // register offload tables
-    __offload_register_tables(&__offload_entry_node,
-                              &__offload_func_node,
-                              &__offload_var_node);
-
-#if HOST_LIBRARY
-    __offload_register_image(&__offload_target_image);
-    atexit(offload_fini);
-#endif // HOST_LIBRARY
-
-#ifdef MYO_SUPPORT
-    __offload_myoRegisterTables(
-#if HOST_LIBRARY
-        &__offload_myo_shared_init_table_start + 1,
-#endif // HOST_LIBRARY
-        &__offload_myo_shared_table_start + 1,
-        &__offload_myo_fptr_table_start + 1
-    );
-#endif // MYO_SUPPORT
-}
-
-static void offload_fini()
-{
-#if HOST_LIBRARY
-    __offload_unregister_image(&__offload_target_image);
-#endif // HOST_LIBRARY
-
-    // unregister offload tables
-    __offload_unregister_tables(&__offload_entry_node,
-                                &__offload_func_node,
-                                &__offload_var_node);
-}
diff --git a/offload/src/ofldend.cpp b/offload/src/ofldend.cpp
deleted file mode 100644
index f61fe59..0000000
--- a/offload/src/ofldend.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#if HOST_LIBRARY
-#include "offload_host.h"
-#include "offload_myo_host.h"
-#else
-#include "offload_target.h"
-#include "offload_myo_target.h"
-#endif
-
-#ifdef TARGET_WINNT
-#define ALLOCATE(name) __declspec(allocate(name))
-#else // TARGET_WINNT
-#define ALLOCATE(name) __attribute__((section(name)))
-#endif // TARGET_WINNT
-
-// offload entry table
-ALLOCATE(OFFLOAD_ENTRY_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FuncTable::Entry)))
-#endif // TARGET_WINNT
-static FuncTable::Entry __offload_entry_table_end = { (const char*)-1 };
-
-// offload function table
-ALLOCATE(OFFLOAD_FUNC_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FuncTable::Entry)))
-#endif // TARGET_WINNT
-static FuncTable::Entry __offload_func_table_end = { (const char*)-1 };
-
-// data table
-ALLOCATE(OFFLOAD_VAR_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(VarTable::Entry)))
-#endif // TARGET_WINNT
-static VarTable::Entry __offload_var_table_end = { (const char*)-1 };
-
-#ifdef MYO_SUPPORT
-
-// offload myo shared var section epilog
-ALLOCATE(OFFLOAD_MYO_SHARED_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(SharedTableEntry)))
-static SharedTableEntry __offload_myo_shared_table_end = { (const char*)-1, 0 };
-#else // TARGET_WINNT
-static SharedTableEntry __offload_myo_shared_table_end = { 0 };
-#endif // TARGET_WINNT
-
-#if HOST_LIBRARY
-// offload myo shared var init section epilog
-ALLOCATE(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(InitTableEntry)))
-static InitTableEntry __offload_myo_shared_init_table_end = { (const char*)-1, 0 };
-#else // TARGET_WINNT
-static InitTableEntry __offload_myo_shared_init_table_end = { 0 };
-#endif // TARGET_WINNT
-#endif // HOST_LIBRARY
-
-// offload myo fptr section epilog
-ALLOCATE(OFFLOAD_MYO_FPTR_TABLE_SECTION_END)
-#ifdef TARGET_WINNT
-__declspec(align(sizeof(FptrTableEntry)))
-static FptrTableEntry __offload_myo_fptr_table_end = { (const char*)-1, 0, 0 };
-#else // TARGET_WINNT
-static FptrTableEntry __offload_myo_fptr_table_end = { 0 };
-#endif // TARGET_WINNT
-
-#endif // MYO_SUPPORT
diff --git a/offload/src/orsl-lite/include/orsl-lite.h b/offload/src/orsl-lite/include/orsl-lite.h
deleted file mode 100644
index f26a335..0000000
--- a/offload/src/orsl-lite/include/orsl-lite.h
+++ /dev/null
@@ -1,221 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef _ORSL_LITE_H_
-#define _ORSL_LITE_H_
-
-#ifndef TARGET_WINNT
-#include <sched.h>
-#else
-#define cpu_set_t int
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/** Type of a ORSLBusySet */
-typedef enum ORSLBusySetType {
-    BUSY_SET_EMPTY = 0,     /**< Empty set */
-    BUSY_SET_PARTIAL = 1,   /**< Non-empty set that omits some threads */
-    BUSY_SET_FULL = 2       /**< A set that includes all threads on the card */
-} BusySetType;
-
-/** ORSLBusySet encapsulation */
-typedef struct ORSLBusySet {
-    BusySetType type;   /**< Set type */
-#ifdef __linux__
-    cpu_set_t cpu_set;  /**< CPU mask (unused for BUSY_SET_EMPTY and
-                           BUSY_SET_PARTIAL sets) represented by the standard
-                           Linux CPU set type -- cpu_set_t. Threads are numbered
-                           starting from 0. The maximal possible thread number
-                           is system-specific. See CPU_SET(3) family of macros
-                           for more details. Unused in ORSL Lite. */
-#endif
-} ORSLBusySet;
-
-/** Client tag */
-typedef char* ORSLTag;
-
-/** Maximal length of tag in characters */
-#define ORSL_MAX_TAG_LEN 128
-
-/** Maximal number of cards that can be managed by ORSL */
-#define ORSL_MAX_CARDS 32
-
-/** Reserves computational resources on a set of cards. Blocks.
- *
- * If any of the resources cannot be reserved, this function will block until
- * they become available. Reservation can be recursive if performed by the
- * same tag. A recursively reserved resource must be released the same number
- * of times it was reserved.
- *
- * @see ORSLTryReserve
- *
- * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
- *                    or > ORSL_MAX_CARDS.
- *
- * @param[in]  inds   Indices of the cards: an integer array with n elements.
- *                    Cannot be NULL if n > 0. Valid card indices are from 0
- *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
- *
- * @param[in]  bsets  Requested resources on each of the card. Cannot be NULL
- *                    if n > 0.
- *
- * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
- *                    must not exeed ORSL_MAX_TAG_LEN.
- *
- * @returns    0      if the resources were successfully reserved
- *
- * @returns    EINVAL if any of the arguments is invalid
- *
- * @returns    EAGAIN limit of recursive reservations reached
- *                    (not in ORSL Lite)
- *
- * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
- *                    equal to BUSY_SET_PARTIAL
- */
-int ORSLReserve(const int n, const int *__restrict inds,
-                const ORSLBusySet *__restrict bsets,
-                const ORSLTag __restrict tag);
-
-/** Reserves computational resources on a set of cards. Does not block.
- *
- * If any of the resources cannot be reserved, this function will return
- * immediately. Reservation can be recursive if performed by the same tag.
- * A recursively reserved resource must be released the same number of times
- * it was reserved.
- *
- * @see ORSLReserve
- *
- * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
- *                    or > ORSL_MAX_CARDS.
- *
- * @param[in]  inds     Indices of the cards: an integer array with n elements.
- *                      Cannot be NULL if n > 0. Valid card indices are from 0
- *                      to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
- *
- * @param[inout] bsets  Requested resources on each of the card. Cannot be
- *                      NULL if n > 0.
- *
- * @param[in]    tag    ORSLTag of the calling client. Cannot be NULL. Length
- *                      must not exceed ORSL_MAX_TAG_LEN.
- *
- * @returns      0      if the resources were successfully reserved
- *
- * @returns      EBUSY  if some of the requested resources are busy
- *
- * @returns      EINVAL if any of the arguments is invalid
- *
- * @returns      EAGAIN limit of recursive reservations reached
- *                      (not in ORSL Lite)
- *
- * @returns      ENOSYS (in ORSL Lite) if type of any of the busy sets is
- *                      equal to BUSY_SET_PARTIAL
- */
-int ORSLTryReserve(const int n, const int *__restrict inds,
-                   const ORSLBusySet *__restrict bsets,
-                   const ORSLTag __restrict tag);
-
-/** Granularify of partial reservation */
-typedef enum ORSLPartialGranularity {
-    GRAN_CARD = 0, /**< Card granularity */
-    GRAN_THREAD = 1 /**< Thread granularity */
-} ORSLPartialGranularity;
-
-/** Requests reservation of some of computational resources on a set of cards.
- * Does not block. Updates user-provided bsets to indicate which resources
- * were reserved.
- *
- * If any of the resources cannot be reserved, this function will update busy
- * sets provided by the caller to reflect what resources were actually
- * reserved. This function supports two granularity modes: 'card' and
- * 'thread'.  When granularity is set to 'card', a failure to reserve a thread
- * on the card will imply that reservation has failed for the whole card. When
- * granularity is set to 'thread', reservation on a card will be considered
- * successful as long as at least one thread on the card was successfully
- * reserved. Reservation can be recursive if performed by the same tag. A
- * recursively reserved resource must be released the same number of times it
- * was reserved.
- *
- * @param[in]  gran   Reservation granularity
- *
- * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
- *                    or > ORSL_MAX_CARDS.
- *
- * @param[in]  inds   Indices of the cards: an integer array with n elements.
- *                    Cannot be NULL if n > 0. Valid card indices are from 0
- *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
- *
- * @param[in]  bsets  Requested resources on each of the card. Cannot be NULL
- *                    if n > 0.
- *
- * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
- *                    must not exceed ORSL_MAX_TAG_LEN.
- *
- * @returns    0      if at least some of the resources were successfully
- *                    reserved
- *
- * @returns    EBUSY  if all of the requested resources are busy
- *
- * @returns    EINVAL if any of the arguments is invalid
- *
- * @returns    EAGAIN limit of recursive reservations reached
- *                    (not in ORSL Lite)
- *
- * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
- *                    equal to BUSY_SET_PARTIAL
- */
-int ORSLReservePartial(const ORSLPartialGranularity gran, const int n,
-                       const int *__restrict inds,
-                       ORSLBusySet *__restrict bsets,
-                       const ORSLTag __restrict tag);
-
-/** Releases previously reserved computational resources on a set of cards.
- *
- * This function will fail if any of the resources to be released were not
- * reserved by the calling client.
- *
- * @see ORSLReserve
- * @see ORSLTryReserve
- * @see ORSLReservePartial
- *
- * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
- *                    or > ORSL_MAX_CARDS.
- *
- * @param[in]  inds   Indices of the cards: an integer array with n elements.
- *                    Cannot be NULL if n > 0. Valid card indices are from 0
- *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
- *
- * @param[in]  bsets  Requested resources on each of the card. Cannot be NULL
- *                    if n > 0.
- *
- * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
- *                    must not exceed ORSL_MAX_TAG_LEN.
- *
- * @returns    0      if the resources were successfully released
- *
- * @returns    EINVAL if any of the arguments is invalid
- *
- * @returns    EPERM  the calling client did not reserve some of the
- *                    resources it is trying to release.
- *
- * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
- *                    equal to BUSY_SET_PARTIAL
- */
-int ORSLRelease(const int n, const int *__restrict inds,
-                const ORSLBusySet *__restrict bsets,
-                const ORSLTag __restrict tag);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/offload/src/orsl-lite/lib/orsl-lite.c b/offload/src/orsl-lite/lib/orsl-lite.c
deleted file mode 100644
index 221cda7..0000000
--- a/offload/src/orsl-lite/lib/orsl-lite.c
+++ /dev/null
@@ -1,337 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include <errno.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "orsl-lite/include/orsl-lite.h"
-
-#define DISABLE_SYMBOL_VERSIONING
-
-#if defined(__linux__) && !defined(DISABLE_SYMBOL_VERSIONING)
-#define symver(src, tgt, verstr) __asm__(".symver " #src "," #tgt verstr)
-symver(ORSLReserve0, ORSLReserve, "@@ORSL_0.0");
-symver(ORSLTryReserve0, ORSLTryReserve, "@@ORSL_0.0");
-symver(ORSLReservePartial0, ORSLReservePartial, "@@ORSL_0.0");
-symver(ORSLRelease0, ORSLRelease, "@@ORSL_0.0");
-#else
-#define ORSLReserve0 ORSLReserve
-#define ORSLTryReserve0 ORSLTryReserve
-#define ORSLReservePartial0 ORSLReservePartial
-#define ORSLRelease0 ORSLRelease
-#endif
-
-#ifdef __linux__
-#include <pthread.h>
-static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t release_cond = PTHREAD_COND_INITIALIZER;
-#endif
-
-#ifdef _WIN32
-#include <windows.h>
-#pragma intrinsic(_ReadWriteBarrier)
-static SRWLOCK global_mutex = SRWLOCK_INIT;
-static volatile int release_cond_initialized = 0;
-static CONDITION_VARIABLE release_cond;
-
-static void state_lazy_init_sync()
-{
-    if (!release_cond_initialized) {
-        AcquireSRWLockExclusive(&global_mutex);
-        _ReadWriteBarrier();
-        if (!release_cond_initialized) {
-            InitializeConditionVariable(&release_cond);
-            release_cond_initialized = 1;
-        }
-        ReleaseSRWLockExclusive(&global_mutex);
-    }
-}
-#endif
-
-static int state_lock()
-{
-#ifdef __linux__
-    return pthread_mutex_lock(&global_mutex);
-#endif
-
-#ifdef _WIN32
-    AcquireSRWLockExclusive(&global_mutex);
-    return 0;
-#endif
-}
-
-static int state_unlock()
-{
-#ifdef __linux__
-    return pthread_mutex_unlock(&global_mutex);
-#endif
-
-#ifdef _WIN32
-    ReleaseSRWLockExclusive(&global_mutex);
-    return 0;
-#endif
-}
-
-static int state_wait_for_release()
-{
-#ifdef __linux__
-    return pthread_cond_wait(&release_cond, &global_mutex);
-#endif
-
-#ifdef _WIN32
-    return SleepConditionVariableSRW(&release_cond,
-            &global_mutex, INFINITE, 0) == 0 ? 1 : 0;
-#endif
-}
-
-static int state_signal_release()
-{
-#ifdef __linux__
-    return pthread_cond_signal(&release_cond);
-#endif
-
-#ifdef _WIN32
-    WakeConditionVariable(&release_cond);
-    return 0;
-#endif
-}
-
-static struct {
-    char owner[ORSL_MAX_TAG_LEN + 1];
-    unsigned long rsrv_cnt;
-} rsrv_data[ORSL_MAX_CARDS];
-
-static int check_args(const int n, const int *__restrict inds,
-                      const ORSLBusySet *__restrict bsets,
-                      const ORSLTag __restrict tag)
-{
-    int i;
-    int card_specified[ORSL_MAX_CARDS];
-    if (tag == NULL) return -1;
-    if (strlen((char *)tag) > ORSL_MAX_TAG_LEN) return -1;
-    if (n < 0 || n >= ORSL_MAX_CARDS) return -1;
-    if (n != 0 && (inds == NULL || bsets == NULL)) return -1;
-    for (i = 0; i < ORSL_MAX_CARDS; i++)
-        card_specified[i] = 0;
-    for (i = 0; i < n; i++) {
-        int ind = inds[i];
-        if (ind < 0 || ind >= ORSL_MAX_CARDS) return -1;
-        if (card_specified[ind]) return -1;
-        card_specified[ind] = 1;
-    }
-    return 0;
-}
-
-static int check_bsets(const int n, const ORSLBusySet *bsets)
-{
-    int i;
-    for (i = 0; i < n; i++)
-        if (bsets[i].type == BUSY_SET_PARTIAL) return -1;
-    return 0;
-}
-
-static int can_reserve_card(int card, const ORSLBusySet *__restrict bset,
-                            const ORSLTag __restrict tag)
-{
-    assert(tag != NULL);
-    assert(bset != NULL);
-    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
-    assert(bset->type != BUSY_SET_PARTIAL);
-
-    return (bset->type == BUSY_SET_EMPTY ||
-            ((rsrv_data[card].rsrv_cnt == 0 ||
-            strncmp((char *)tag,
-                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0) &&
-            rsrv_data[card].rsrv_cnt < ULONG_MAX)) ? 0 : - 1;
-}
-
-static void reserve_card(int card, const ORSLBusySet *__restrict bset,
-                         const ORSLTag __restrict tag)
-{
-    assert(tag != NULL);
-    assert(bset != NULL);
-    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
-    assert(bset->type != BUSY_SET_PARTIAL);
-
-    if (bset->type == BUSY_SET_EMPTY)
-        return;
-
-    assert(rsrv_data[card].rsrv_cnt == 0 ||
-            strncmp((char *)tag,
-                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0);
-    assert(rsrv_data[card].rsrv_cnt < ULONG_MAX);
-
-    if (rsrv_data[card].rsrv_cnt == 0)
-        strncpy(rsrv_data[card].owner, (char *)tag, ORSL_MAX_TAG_LEN);
-    rsrv_data[card].owner[ORSL_MAX_TAG_LEN] = '\0';
-    rsrv_data[card].rsrv_cnt++;
-}
-
-static int can_release_card(int card, const ORSLBusySet *__restrict bset,
-                            const ORSLTag __restrict tag)
-{
-    assert(tag != NULL);
-    assert(bset != NULL);
-    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
-    assert(bset->type != BUSY_SET_PARTIAL);
-
-    return (bset->type == BUSY_SET_EMPTY || (rsrv_data[card].rsrv_cnt > 0 &&
-                strncmp((char *)tag,
-                    rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0)) ? 0 : 1;
-}
-
-static void release_card(int card, const ORSLBusySet *__restrict bset,
-                         const ORSLTag __restrict tag)
-{
-    assert(tag != NULL);
-    assert(bset != NULL);
-    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
-    assert(bset->type != BUSY_SET_PARTIAL);
-
-    if (bset->type == BUSY_SET_EMPTY)
-        return;
-
-    assert(strncmp((char *)tag,
-                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0);
-    assert(rsrv_data[card].rsrv_cnt > 0);
-
-    rsrv_data[card].rsrv_cnt--;
-}
-
-int ORSLReserve0(const int n, const int *__restrict inds,
-                const ORSLBusySet *__restrict bsets,
-                const ORSLTag __restrict tag)
-{
-    int i, ok;
-
-    if (n == 0) return 0;
-    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
-    if (check_bsets(n, bsets) != 0) return ENOSYS;
-
-    state_lock();
-
-    /* Loop until we find that all the resources we want are available */
-    do {
-        ok = 1;
-        for (i = 0; i < n; i++)
-            if (can_reserve_card(inds[i], &bsets[i], tag) != 0) {
-                ok = 0;
-                /* Wait for someone to release some resources */
-                state_wait_for_release();
-                break;
-            }
-    } while (!ok);
-
-    /* At this point we are good to reserve_card the resources we want */
-    for (i = 0; i < n; i++)
-        reserve_card(inds[i], &bsets[i], tag);
-
-    state_unlock();
-    return 0;
-}
-
-int ORSLTryReserve0(const int n, const int *__restrict inds,
-                   const ORSLBusySet *__restrict bsets,
-                   const ORSLTag __restrict tag)
-{
-    int i, rc = EBUSY;
-
-    if (n == 0) return 0;
-    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
-    if (check_bsets(n, bsets) != 0) return ENOSYS;
-
-    state_lock();
-
-    /* Check resource availability once */
-    for (i = 0; i < n; i++)
-        if (can_reserve_card(inds[i], &bsets[i], tag) != 0)
-            goto bail_out;
-
-    /* At this point we are good to reserve the resources we want */
-    for (i = 0; i < n; i++)
-        reserve_card(inds[i], &bsets[i], tag);
-
-    rc = 0;
-
-bail_out:
-    state_unlock();
-    return rc;
-}
-
-int ORSLReservePartial0(const ORSLPartialGranularity gran, const int n,
-                       const int *__restrict inds, ORSLBusySet *__restrict bsets,
-                       const ORSLTag __restrict tag)
-{
-    int rc = EBUSY;
-    int i, num_avail = n;
-
-    if (n == 0) return 0;
-    if (gran != GRAN_CARD && gran != GRAN_THREAD) return EINVAL;
-    if (gran != GRAN_CARD) return EINVAL;
-    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
-    if (check_bsets(n, bsets) != 0) return ENOSYS;
-
-    state_lock();
-
-    /* Check resource availability once; remove unavailable resources from the
-     * user-provided list */
-    for (i = 0; i < n; i++)
-        if (can_reserve_card(inds[i], &bsets[i], tag) != 0) {
-            num_avail--;
-            bsets[i].type = BUSY_SET_EMPTY;
-        }
-
-    if (num_avail == 0)
-        goto bail_out;
-
-    /* At this point we are good to reserve the resources we want */
-    for (i = 0; i < n; i++)
-        reserve_card(inds[i], &bsets[i], tag);
-
-    rc = 0;
-
-bail_out:
-    state_unlock();
-    return rc;
-}
-
-int ORSLRelease0(const int n, const int *__restrict inds,
-                const ORSLBusySet *__restrict bsets,
-                const ORSLTag __restrict tag)
-{
-    int i, rc = EPERM;
-
-    if (n == 0) return 0;
-    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
-    if (check_bsets(n, bsets) != 0) return ENOSYS;
-
-    state_lock();
-
-    /* Check that we can release all the resources */
-    for (i = 0; i < n; i++)
-        if (can_release_card(inds[i], &bsets[i], tag) != 0)
-            goto bail_out;
-
-    /* At this point we are good to release the resources we want */
-    for (i = 0; i < n; i++)
-        release_card(inds[i], &bsets[i], tag);
-
-    state_signal_release();
-
-    rc = 0;
-
-bail_out:
-    state_unlock();
-    return rc;
-}
-
-/* vim:set et: */
diff --git a/offload/src/orsl-lite/version.txt b/offload/src/orsl-lite/version.txt
deleted file mode 100644
index ab5f599..0000000
--- a/offload/src/orsl-lite/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-ORSL-lite 0.7
diff --git a/offload/src/rdtsc.h b/offload/src/rdtsc.h
deleted file mode 100644
index da91d71..0000000
--- a/offload/src/rdtsc.h
+++ /dev/null
@@ -1,17 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include <stdint.h>
-
-uint64_t _rdtsc()
-{
-  uint32_t eax, edx;
-  asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
-  return ((uint64_t)edx << 32) | eax;
-}
diff --git a/offload/src/use_mpss2.txt b/offload/src/use_mpss2.txt
deleted file mode 100644
index 948f483..0000000
--- a/offload/src/use_mpss2.txt
+++ /dev/null
@@ -1 +0,0 @@
-2.1.6720-13
diff --git a/offload/src/use_mpss_win.txt b/offload/src/use_mpss_win.txt
deleted file mode 100644
index 948f483..0000000
--- a/offload/src/use_mpss_win.txt
+++ /dev/null
@@ -1 +0,0 @@
-2.1.6720-13
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index d17d95e..447b3cd 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -302,7 +302,7 @@
 # OMPT-support defaults to ON for OpenMP 5.0+ and if the requirements in
 # cmake/config-ix.cmake are fulfilled.
 set(OMPT_DEFAULT FALSE)
-if ((${LIBOMP_OMP_VERSION} GREATER 49) AND (LIBOMP_HAVE_OMPT_SUPPORT))
+if ((${LIBOMP_OMP_VERSION} GREATER 49) AND (LIBOMP_HAVE_OMPT_SUPPORT) AND (NOT WIN32))
   set(OMPT_DEFAULT TRUE)
 endif()
 set(LIBOMP_OMPT_SUPPORT ${OMPT_DEFAULT} CACHE BOOL
@@ -331,6 +331,10 @@
   libomp_error_say("Hwloc requested but not available")
 endif()
 
+# Hierarchical scheduling support
+set(LIBOMP_USE_HIER_SCHED FALSE CACHE BOOL
+  "Hierarchical scheduling support?")
+
 # Setting final library name
 set(LIBOMP_DEFAULT_LIB_NAME libomp)
 if(${PROFILE_LIBRARY})
diff --git a/runtime/cmake/LibompHandleFlags.cmake b/runtime/cmake/LibompHandleFlags.cmake
index 4bdbf8f..efe2099 100644
--- a/runtime/cmake/LibompHandleFlags.cmake
+++ b/runtime/cmake/LibompHandleFlags.cmake
@@ -28,22 +28,25 @@
   set(flags_local)
   libomp_append(flags_local -fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
   libomp_append(flags_local -fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
-  libomp_append(flags_local -Wno-sign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
-  libomp_append(flags_local -Wno-unused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
-  libomp_append(flags_local -Wno-unused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
-  libomp_append(flags_local -Wno-unused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
-  libomp_append(flags_local -Wno-unused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+  if(${OPENMP_STANDALONE_BUILD})
+    libomp_append(flags_local -Wsign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+    libomp_append(flags_local -Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+    libomp_append(flags_local -Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+    libomp_append(flags_local -Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+    libomp_append(flags_local -Wunused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+    libomp_append(flags_local -Wdeprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+    libomp_append(flags_local -Wunknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+    libomp_append(flags_local -Wcomment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+    libomp_append(flags_local -Wself-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+    libomp_append(flags_local -Wformat-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+  endif()
   libomp_append(flags_local -Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG)
   libomp_append(flags_local -Wno-covered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
-  libomp_append(flags_local -Wno-deprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
   libomp_append(flags_local -Wno-gnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
-  libomp_append(flags_local -Wno-unknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
   libomp_append(flags_local -Wno-missing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
   libomp_append(flags_local -Wno-missing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
-  libomp_append(flags_local -Wno-comment LIBOMP_HAVE_WNO_COMMENT_FLAG)
-  libomp_append(flags_local -Wno-self-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
   libomp_append(flags_local -Wno-vla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
-  libomp_append(flags_local -Wno-format-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+  libomp_append(flags_local -Wstringop-overflow=0 LIBOMP_HAVE_WSTRINGOP_OVERFLOW_FLAG)
   libomp_append(flags_local /GS LIBOMP_HAVE_GS_FLAG)
   libomp_append(flags_local /EHsc LIBOMP_HAVE_EHSC_FLAG)
   libomp_append(flags_local /Oy- LIBOMP_HAVE_OY__FLAG)
@@ -87,6 +90,9 @@
 function(libomp_get_cxxflags cxxflags)
   set(cxxflags_local)
   libomp_get_c_and_cxxflags_common(cxxflags_local)
+  if(${OPENMP_STANDALONE_BUILD})
+      libomp_append(cxxflags_local -Wcast-qual LIBOMP_HAVE_WCAST_QUAL_FLAG)
+  endif()
   set(cxxflags_local ${cxxflags_local} ${LIBOMP_CXXFLAGS})
   libomp_setup_flags(cxxflags_local)
   set(${cxxflags} ${cxxflags_local} PARENT_SCOPE)
diff --git a/runtime/cmake/config-ix.cmake b/runtime/cmake/config-ix.cmake
index a20b18d..5415e57 100644
--- a/runtime/cmake/config-ix.cmake
+++ b/runtime/cmake/config-ix.cmake
@@ -50,6 +50,7 @@
 check_cxx_compiler_flag(-fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
 check_cxx_compiler_flag(-fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
 check_c_compiler_flag("-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+check_cxx_compiler_flag(-Wcast-qual LIBOMP_HAVE_WCAST_QUAL_FLAG)
 check_c_compiler_flag(-Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
 check_c_compiler_flag(-Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
 check_c_compiler_flag(-Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
@@ -66,6 +67,7 @@
 check_c_compiler_flag(-Wself-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
 check_c_compiler_flag(-Wvla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
 check_c_compiler_flag(-Wformat-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+check_c_compiler_flag(-Wstringop-overflow=0 LIBOMP_HAVE_WSTRINGOP_OVERFLOW_FLAG)
 check_c_compiler_flag(-msse2 LIBOMP_HAVE_MSSE2_FLAG)
 check_c_compiler_flag(-ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
 libomp_check_architecture_flag(-mmic LIBOMP_HAVE_MMIC_FLAG)
diff --git a/runtime/src/exports_so.txt b/runtime/src/exports_so.txt
index 3e22324..fc7c5f9 100644
--- a/runtime/src/exports_so.txt
+++ b/runtime/src/exports_so.txt
@@ -107,6 +107,8 @@
 } OMP_3.0;
 OMP_4.0 {
 } OMP_3.1;
+OMP_4.5 {
+} OMP_4.0;
 
 # sets up GCC GOMP_ version dependency chain
 GOMP_1.0 {
diff --git a/runtime/src/i18n/en_US.txt b/runtime/src/i18n/en_US.txt
index 6329374..6882b3a 100644
--- a/runtime/src/i18n/en_US.txt
+++ b/runtime/src/i18n/en_US.txt
@@ -423,6 +423,7 @@
 AffHWSubsetManyNodes         "KMP_HW_SUBSET ignored: too many NUMA Nodes requested."
 AffHWSubsetManyTiles         "KMP_HW_SUBSET ignored: too many L2 Caches requested."
 AffHWSubsetManyProcs         "KMP_HW_SUBSET ignored: too many Procs requested."
+HierSchedInvalid             "Hierarchy ignored: unsupported level: %1$s."
 
 
 # --------------------------------------------------------------------------------------------------
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 94485a1..c6afb5e 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -83,6 +83,12 @@
 class kmp_stats_list;
 #endif
 
+#if KMP_USE_HIER_SCHED
+// Only include hierarchical scheduling if affinity is supported
+#undef KMP_USE_HIER_SCHED
+#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED
+#endif
+
 #if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED
 #include "hwloc.h"
 #ifndef HWLOC_OBJ_NUMANODE
@@ -123,15 +129,6 @@
 #include "ompt-internal.h"
 #endif
 
-/*Select data placement in NUMA memory */
-#define NO_FIRST_TOUCH 0
-#define FIRST_TOUCH 1 /* Exploit SGI's first touch page placement algo */
-
-/* If not specified on compile command line, assume no first touch */
-#ifndef BUILD_MEMORY
-#define BUILD_MEMORY NO_FIRST_TOUCH
-#endif
-
 // 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64.
 // 3 - fast allocation using sync, non-sync free lists of any size, non-self
 // free lists of limited size.
@@ -256,6 +253,12 @@
     while (*(_x) >= '0' && *(_x) <= '9')                                       \
       (_x)++;                                                                  \
   }
+#define SKIP_TOKEN(_x)                                                         \
+  {                                                                            \
+    while ((*(_x) >= '0' && *(_x) <= '9') || (*(_x) >= 'a' && *(_x) <= 'z') || \
+           (*(_x) >= 'A' && *(_x) <= 'Z') || *(_x) == '_')                     \
+      (_x)++;                                                                  \
+  }
 #define SKIP_TO(_x, _c)                                                        \
   {                                                                            \
     while (*(_x) != '\0' && *(_x) != (_c))                                     \
@@ -583,7 +586,7 @@
 #define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0)
 #define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
 #define KMP_CPU_SET_ITERATE(i, mask)                                           \
-  for (i = (mask)->begin(); i != (mask)->end(); i = (mask)->next(i))
+  for (i = (mask)->begin(); (int)i != (mask)->end(); i = (mask)->next(i))
 #define KMP_CPU_SET(i, mask) (mask)->set(i)
 #define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
 #define KMP_CPU_CLR(i, mask) (mask)->clear(i)
@@ -827,36 +830,6 @@
 #define KMP_GTID_UNKNOWN (-5) /* Is not known */
 #define KMP_GTID_MIN (-6) /* Minimal gtid for low bound check in DEBUG */
 
-#define __kmp_get_gtid() __kmp_get_global_thread_id()
-#define __kmp_entry_gtid() __kmp_get_global_thread_id_reg()
-
-#define __kmp_tid_from_gtid(gtid)                                              \
-  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)]->th.th_info.ds.ds_tid)
-
-#define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid()))
-#define __kmp_gtid_from_tid(tid, team)                                         \
-  (KMP_DEBUG_ASSERT((tid) >= 0 && (team) != NULL),                             \
-   team->t.t_threads[(tid)]->th.th_info.ds.ds_gtid)
-
-#define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team)
-#define __kmp_team_from_gtid(gtid)                                             \
-  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)]->th.th_team)
-
-#define __kmp_thread_from_gtid(gtid)                                           \
-  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)])
-#define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid()))
-
-// Returns current thread (pointer to kmp_info_t). In contrast to
-// __kmp_get_thread(), it works with registered and not-yet-registered threads.
-#define __kmp_gtid_from_thread(thr)                                            \
-  (KMP_DEBUG_ASSERT((thr) != NULL), (thr)->th.th_info.ds.ds_gtid)
-
-// AT: Which way is correct?
-// AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc;
-// AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc;
-#define __kmp_get_team_num_threads(gtid)                                       \
-  (__kmp_threads[(gtid)]->th.th_team->t.t_nproc)
-
 /* ------------------------------------------------------------------------ */
 
 #define KMP_UINT64_MAX                                                         \
@@ -940,7 +913,7 @@
 // HW TSC is used to reduce overhead (clock tick instead of nanosecond).
 extern kmp_uint64 __kmp_ticks_per_msec;
 #if KMP_COMPILER_ICC
-#define KMP_NOW() _rdtsc()
+#define KMP_NOW() ((kmp_uint64)_rdtsc())
 #else
 #define KMP_NOW() __kmp_hardware_timestamp()
 #endif
@@ -1010,11 +983,6 @@
 
 #define KMP_MASTER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) == 0)
 #define KMP_WORKER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) != 0)
-#define KMP_UBER_GTID(gtid)                                                    \
-  (KMP_DEBUG_ASSERT((gtid) >= KMP_GTID_MIN),                                   \
-   KMP_DEBUG_ASSERT((gtid) < __kmp_threads_capacity),                          \
-   (gtid) >= 0 && __kmp_root[(gtid)] && __kmp_threads[(gtid)] &&               \
-       (__kmp_threads[(gtid)] == __kmp_root[(gtid)]->r.r_uber_thread))
 #define KMP_INITIAL_GTID(gtid) ((gtid) == 0)
 
 #ifndef TRUE
@@ -1062,9 +1030,9 @@
 // regression after removal of extra PAUSE from KMP_YIELD_SPIN(). Changing
 // the delay from 100 to 300 showed even better performance than double PAUSE
 // on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
-static void __kmp_x86_pause(void) { _mm_delay_32(300); }
+static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
 #else
-static void __kmp_x86_pause(void) { _mm_pause(); }
+static inline void __kmp_x86_pause(void) { _mm_pause(); }
 #endif
 #define KMP_CPU_PAUSE() __kmp_x86_pause()
 #elif KMP_ARCH_PPC64
@@ -1508,6 +1476,30 @@
 
 /* ------------------------------------------------------------------------ */
 
+#if KMP_USE_HIER_SCHED
+// Shared barrier data that exists inside a single unit of the scheduling
+// hierarchy
+typedef struct kmp_hier_private_bdata_t {
+  kmp_int32 num_active;
+  kmp_uint64 index;
+  kmp_uint64 wait_val[2];
+} kmp_hier_private_bdata_t;
+#endif
+
+typedef struct kmp_sched_flags {
+  unsigned ordered : 1;
+  unsigned nomerge : 1;
+  unsigned contains_last : 1;
+#if KMP_USE_HIER_SCHED
+  unsigned use_hier : 1;
+  unsigned unused : 28;
+#else
+  unsigned unused : 29;
+#endif
+} kmp_sched_flags_t;
+
+KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
+
 #if KMP_STATIC_STEAL_ENABLED
 typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 count;
@@ -1625,14 +1617,17 @@
     dispatch_private_info64_t p64;
   } u;
   enum sched_type schedule; /* scheduling algorithm */
-  kmp_int32 ordered; /* ordered clause specified */
+  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
   kmp_int32 ordered_bumped;
   // To retain the structure size after making ordered_iteration scalar
   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
   // Stack of buffers for nest of serial regions
   struct dispatch_private_info *next;
-  kmp_int32 nomerge; /* don't merge iters if serialized */
   kmp_int32 type_size; /* the size of types in private_info */
+#if KMP_USE_HIER_SCHED
+  kmp_int32 hier_id;
+  void *parent; /* hierarchical scheduling parent pointer */
+#endif
   enum cons_type pushed_ws;
 } dispatch_private_info_t;
 
@@ -1667,6 +1662,9 @@
   volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
   kmp_int32 doacross_num_done; // count finished threads
 #endif
+#if KMP_USE_HIER_SCHED
+  void *hier;
+#endif
 #if KMP_USE_HWLOC
   // When linking with libhwloc, the ORDERED EPCC test slows down on big
   // machines (> 48 cores). Performance analysis showed that a cache thrash
@@ -2109,8 +2107,9 @@
 
 #if OMP_40_ENABLED
 typedef struct kmp_taskgroup {
-  kmp_int32 count; // number of allocated and not yet complete tasks
-  kmp_int32 cancel_request; // request for cancellation of this taskgroup
+  std::atomic<kmp_int32> count; // number of allocated and incomplete tasks
+  std::atomic<kmp_int32>
+      cancel_request; // request for cancellation of this taskgroup
   struct kmp_taskgroup *parent; // parent taskgroup
 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
 #if OMP_45_ENABLED
@@ -2149,8 +2148,8 @@
   kmp_uint32 id;
 #endif
 
-  volatile kmp_int32 npredecessors;
-  volatile kmp_int32 nrefs;
+  std::atomic<kmp_int32> npredecessors;
+  std::atomic<kmp_int32> nrefs;
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
@@ -2242,7 +2241,7 @@
   /* Currently not used except for perhaps IDB */
   kmp_taskdata_t *td_parent; /* parent task                             */
   kmp_int32 td_level; /* task nesting level                      */
-  kmp_int32 td_untied_count; /* untied task active parts counter        */
+  std::atomic<kmp_int32> td_untied_count; // untied task active parts counter
   ident_t *td_ident; /* task identifier                         */
   // Taskwait data.
   ident_t *td_taskwait_ident;
@@ -2250,10 +2249,10 @@
   kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */
   KMP_ALIGN_CACHE kmp_internal_control_t
       td_icvs; /* Internal control variables for the task */
-  KMP_ALIGN_CACHE volatile kmp_int32
+  KMP_ALIGN_CACHE std::atomic<kmp_int32>
       td_allocated_child_tasks; /* Child tasks (+ current task) not yet
                                    deallocated */
-  volatile kmp_int32
+  std::atomic<kmp_int32>
       td_incomplete_child_tasks; /* Child tasks not yet complete */
 #if OMP_40_ENABLED
   kmp_taskgroup_t
@@ -2338,7 +2337,7 @@
   kmp_int32 tt_untied_task_encountered;
 
   KMP_ALIGN_CACHE
-  volatile kmp_int32 tt_unfinished_threads; /* #threads still active      */
+  std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
 
   KMP_ALIGN_CACHE
   volatile kmp_uint32
@@ -2480,6 +2479,10 @@
   kmp_uint8 th_active_in_pool; // included in count of #active threads in pool
   int th_active; // ! sleeping; 32 bits for TCR/TCW
   struct cons_header *th_cons; // used for consistency check
+#if KMP_USE_HIER_SCHED
+  // used for hierarchical scheduling
+  kmp_hier_private_bdata_t *th_hier_bar_data;
+#endif
 
   /* Add the syncronizing data which is cache aligned and padded. */
   KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier];
@@ -2511,6 +2514,9 @@
 #if KMP_STATS_ENABLED
   kmp_stats_list *th_stats;
 #endif
+#if KMP_OS_UNIX
+  std::atomic<bool> th_blocking;
+#endif
 } kmp_base_info_t;
 
 typedef union KMP_ALIGN_CACHE kmp_info {
@@ -2561,7 +2567,7 @@
   // ---------------------------------------------------------------------------
   KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
   kmp_balign_team_t t_bar[bs_last_barrier];
-  volatile int t_construct; // count of single directive encountered by team
+  std::atomic<int> t_construct; // count of single directive encountered by team
   char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron
 
   // Master only
@@ -2636,12 +2642,14 @@
 // for SERIALIZED teams nested 2 or more levels deep
 #if OMP_40_ENABLED
   // typed flag to store request state of cancellation
-  kmp_int32 t_cancel_request;
+  std::atomic<kmp_int32> t_cancel_request;
 #endif
   int t_master_active; // save on fork, restore on join
   kmp_taskq_t t_taskq; // this team's task queue
   void *t_copypriv_data; // team specific pointer to copyprivate data array
-  kmp_uint32 t_copyin_counter;
+#if KMP_OS_WINDOWS
+  std::atomic<kmp_uint32> t_copyin_counter;
+#endif
 #if USE_ITT_BUILD
   void *t_stack_id; // team specific stack stitching id (for ittnotify)
 #endif /* USE_ITT_BUILD */
@@ -2685,7 +2693,8 @@
   volatile int r_active; /* TRUE if some region in a nest has > 1 thread */
   // GEH: This is misnamed, should be r_in_parallel
   volatile int r_nested; // TODO: GEH - This is unused, just remove it entirely.
-  int r_in_parallel; /* keeps a count of active parallel regions per root */
+  // keeps a count of active parallel regions per root
+  std::atomic<int> r_in_parallel;
   // GEH: This is misnamed, should be r_active_levels
   kmp_team_t *r_root_team;
   kmp_team_t *r_hot_team;
@@ -2742,8 +2751,8 @@
                                       entry pointer */
 
 extern char *__kmp_debug_buffer; /* Debug buffer itself */
-extern int __kmp_debug_count; /* Counter for number of lines printed in buffer
-                                 so far */
+extern std::atomic<int> __kmp_debug_count; /* Counter for number of lines
+                                              printed in buffer so far */
 extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase
                                           recommended in warnings */
 /* end rotating debug buffer */
@@ -2973,7 +2982,7 @@
 extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params;
 
 #if KMP_DEBUG_ADAPTIVE_LOCKS
-extern char *__kmp_speculative_statsfile;
+extern const char *__kmp_speculative_statsfile;
 #endif
 
 #endif // KMP_USE_ADAPTIVE_LOCKS
@@ -3000,23 +3009,69 @@
    threads, and those in the thread pool */
 extern volatile int __kmp_all_nth;
 extern int __kmp_thread_pool_nth;
-extern volatile int __kmp_thread_pool_active_nth;
+extern std::atomic<int> __kmp_thread_pool_active_nth;
 
 extern kmp_root_t **__kmp_root; /* root of thread hierarchy */
 /* end data protected by fork/join lock */
 /* ------------------------------------------------------------------------- */
 
+#define __kmp_get_gtid() __kmp_get_global_thread_id()
+#define __kmp_entry_gtid() __kmp_get_global_thread_id_reg()
+#define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid()))
+#define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team)
+#define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid()))
+
+// AT: Which way is correct?
+// AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc;
+// AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc;
+#define __kmp_get_team_num_threads(gtid)                                       \
+  (__kmp_threads[(gtid)]->th.th_team->t.t_nproc)
+
+static inline bool KMP_UBER_GTID(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= KMP_GTID_MIN);
+  KMP_DEBUG_ASSERT(gtid < __kmp_threads_capacity);
+  return (gtid >= 0 && __kmp_root[gtid] && __kmp_threads[gtid] &&
+          __kmp_threads[gtid] == __kmp_root[gtid]->r.r_uber_thread);
+}
+
+static inline int __kmp_tid_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  return __kmp_threads[gtid]->th.th_info.ds.ds_tid;
+}
+
+static inline int __kmp_gtid_from_tid(int tid, const kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(tid >= 0 && team);
+  return team->t.t_threads[tid]->th.th_info.ds.ds_gtid;
+}
+
+static inline int __kmp_gtid_from_thread(const kmp_info_t *thr) {
+  KMP_DEBUG_ASSERT(thr);
+  return thr->th.th_info.ds.ds_gtid;
+}
+
+static inline kmp_info_t *__kmp_thread_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  return __kmp_threads[gtid];
+}
+
+static inline kmp_team_t *__kmp_team_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  return __kmp_threads[gtid]->th.th_team;
+}
+
+/* ------------------------------------------------------------------------- */
+
 extern kmp_global_t __kmp_global; /* global status */
 
 extern kmp_info_t __kmp_monitor;
-extern volatile kmp_uint32 __kmp_team_counter; // For Debugging Support Library
-extern volatile kmp_uint32 __kmp_task_counter; // For Debugging Support Library
+// For Debugging Support Library
+extern std::atomic<kmp_uint32> __kmp_team_counter;
+// For Debugging Support Library
+extern std::atomic<kmp_uint32> __kmp_task_counter;
 
 #if USE_DEBUGGER
-
 #define _KMP_GEN_ID(counter)                                                   \
-  (__kmp_debugging ? KMP_TEST_THEN_INC32((volatile kmp_int32 *)&counter) + 1   \
-                   : ~0)
+  (__kmp_debugging ? KMP_ATOMIC_INC(&counter) + 1 : ~0)
 #else
 #define _KMP_GEN_ID(counter) (~0)
 #endif /* USE_DEBUGGER */
@@ -3763,11 +3818,12 @@
   kmp_int64 st; // stride
 };
 KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
-                                     kmp_int32 num_dims, struct kmp_dim *dims);
+                                     kmp_int32 num_dims,
+                                     const struct kmp_dim *dims);
 KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid,
-                                     kmp_int64 *vec);
+                                     const kmp_int64 *vec);
 KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid,
-                                     kmp_int64 *vec);
+                                     const kmp_int64 *vec);
 KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
 #endif
 
diff --git a/runtime/src/kmp_affinity.cpp b/runtime/src/kmp_affinity.cpp
index b7da8d4..50fc131 100644
--- a/runtime/src/kmp_affinity.cpp
+++ b/runtime/src/kmp_affinity.cpp
@@ -17,6 +17,9 @@
 #include "kmp_io.h"
 #include "kmp_str.h"
 #include "kmp_wrapper_getpid.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
 
 // Store the real or imagined machine hierarchy here
 static hierarchy_info machine_hierarchy;
@@ -88,7 +91,7 @@
   char *end = buf + buf_len - 1;
 
   // Find first element / check for empty set.
-  size_t i;
+  int i;
   i = mask->begin();
   if (i == mask->end()) {
     KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
@@ -98,7 +101,7 @@
     return buf;
   }
 
-  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
+  KMP_SNPRINTF(scan, end - scan + 1, "{%d", i);
   while (*scan != '\0')
     scan++;
   i++;
@@ -114,7 +117,7 @@
     if (end - scan < 15) {
       break;
     }
-    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
+    KMP_SNPRINTF(scan, end - scan + 1, ",%-d", i);
     while (*scan != '\0')
       scan++;
   }
@@ -837,7 +840,7 @@
   *address2os =
       (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
   int avail_ct = 0;
-  unsigned int i;
+  int i;
   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
     // Skip this proc if it is not included in the machine model.
     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
@@ -955,17 +958,6 @@
   unsigned threadId; //      ""
 };
 
-static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
-                                                   const void *b) {
-  const apicThreadInfo *aa = (const apicThreadInfo *)a;
-  const apicThreadInfo *bb = (const apicThreadInfo *)b;
-  if (aa->osId < bb->osId)
-    return -1;
-  if (aa->osId > bb->osId)
-    return 1;
-  return 0;
-}
-
 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                      const void *b) {
   const apicThreadInfo *aa = (const apicThreadInfo *)a;
@@ -992,7 +984,6 @@
 static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                             kmp_i18n_id_t *const msg_id) {
   kmp_cpuid buf;
-  int rc;
   *address2os = NULL;
   *msg_id = kmp_i18n_null;
 
@@ -1344,7 +1335,7 @@
                __kmp_nThreadsPerCore, __kmp_ncores);
   }
   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
   for (i = 0; i < nApics; ++i) {
     __kmp_pu_os_idx[i] = threadInfo[i].osId;
@@ -1799,7 +1790,6 @@
     int newPkgLevel = -1;
     int newCoreLevel = -1;
     int newThreadLevel = -1;
-    int i;
     for (level = 0; level < depth; level++) {
       if ((maxCt[level] == 1) && (level != pkgLevel)) {
         // Remove this level. Never remove the package level
@@ -1869,16 +1859,6 @@
 typedef unsigned *ProcCpuInfo;
 static unsigned maxIndex = pkgIdIndex;
 
-static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) {
-  const unsigned *aa = (const unsigned *)a;
-  const unsigned *bb = (const unsigned *)b;
-  if (aa[osIdIndex] < bb[osIdIndex])
-    return -1;
-  if (aa[osIdIndex] > bb[osIdIndex])
-    return 1;
-  return 0;
-}
-
 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
                                                   const void *b) {
   unsigned i;
@@ -1895,6 +1875,76 @@
   return 0;
 }
 
+#if KMP_USE_HIER_SCHED
+// Set the array sizes for the hierarchy layers
+static void __kmp_dispatch_set_hierarchy_values() {
+  // Set the maximum number of L1's to number of cores
+  // Set the maximum number of L2's to either number of cores / 2 for
+  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
+  // Or the number of cores for Intel(R) Xeon(R) processors
+  // Set the maximum number of NUMA nodes and L3's to number of packages
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
+      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+  if (__kmp_mic_type >= mic3)
+    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
+  else
+#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
+  // Set the number of threads per unit
+  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
+      __kmp_nThreadsPerCore;
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+  if (__kmp_mic_type >= mic3)
+    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
+        2 * __kmp_nThreadsPerCore;
+  else
+#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
+        __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
+      nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
+      nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
+      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
+}
+
+// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
+// i.e., this thread's L1 or this thread's L2, etc.
+int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
+  int index = type + 1;
+  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
+  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
+  if (type == kmp_hier_layer_e::LAYER_THREAD)
+    return tid;
+  else if (type == kmp_hier_layer_e::LAYER_LOOP)
+    return 0;
+  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
+  if (tid >= num_hw_threads)
+    tid = tid % num_hw_threads;
+  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
+}
+
+// Return the number of t1's per t2
+int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
+  int i1 = t1 + 1;
+  int i2 = t2 + 1;
+  KMP_DEBUG_ASSERT(i1 <= i2);
+  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
+  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
+  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
+  // (nthreads/t2) / (nthreads/t1) = t1 / t2
+  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
+}
+#endif // KMP_USE_HIER_SCHED
+
 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
 // affinity map.
 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
@@ -2439,7 +2489,7 @@
 #endif // KMP_MIC && REDUCE_TEAM_SIZE
 
   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-  KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
   for (i = 0; i < num_avail; ++i) { // fill the os indices
     __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
@@ -2460,7 +2510,6 @@
   // which has a sibling. These levels are in the map, and the package level is
   // always in the map.
   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
-  int level = 0;
   for (index = threadIdIndex; index < maxIndex; index++) {
     KMP_ASSERT(totals[index] >= totals[index + 1]);
     inMap[index] = (totals[index] > totals[index + 1]);
@@ -3953,12 +4002,22 @@
 static int *procarr = NULL;
 static int __kmp_aff_depth = 0;
 
+#if KMP_USE_HIER_SCHED
+#define KMP_EXIT_AFF_NONE                                                      \
+  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
+  KMP_ASSERT(address2os == NULL);                                              \
+  __kmp_apply_thread_places(NULL, 0);                                          \
+  __kmp_create_affinity_none_places();                                         \
+  __kmp_dispatch_set_hierarchy_values();                                       \
+  return;
+#else
 #define KMP_EXIT_AFF_NONE                                                      \
   KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
   KMP_ASSERT(address2os == NULL);                                              \
   __kmp_apply_thread_places(NULL, 0);                                          \
   __kmp_create_affinity_none_places();                                         \
   return;
+#endif
 
 // Create a one element mask array (set of places) which only contains the
 // initial process's affinity mask
@@ -4300,6 +4359,10 @@
     KMP_ASSERT(address2os != NULL);
   }
 
+#if KMP_USE_HIER_SCHED
+  __kmp_dispatch_set_hierarchy_values();
+#endif
+
   if (address2os == NULL) {
     if (KMP_AFFINITY_CAPABLE() &&
         (__kmp_affinity_verbose ||
@@ -4685,8 +4748,6 @@
 #if OMP_40_ENABLED
 
 void __kmp_affinity_set_place(int gtid) {
-  int retval;
-
   if (!KMP_AFFINITY_CAPABLE()) {
     return;
   }
@@ -4855,8 +4916,6 @@
 }
 
 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
-  int retval;
-
   if (!KMP_AFFINITY_CAPABLE()) {
     return -1;
   }
@@ -4889,8 +4948,6 @@
 }
 
 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
-  int retval;
-
   if (!KMP_AFFINITY_CAPABLE()) {
     return -1;
   }
@@ -4923,8 +4980,6 @@
 }
 
 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
-  int retval;
-
   if (!KMP_AFFINITY_CAPABLE()) {
     return -1;
   }
diff --git a/runtime/src/kmp_affinity.h b/runtime/src/kmp_affinity.h
index 415a4f5..cb1a7e3 100644
--- a/runtime/src/kmp_affinity.h
+++ b/runtime/src/kmp_affinity.h
@@ -81,13 +81,12 @@
       return error;
     }
     int get_proc_group() const override {
-      int i;
       int group = -1;
 #if KMP_OS_WINDOWS
       if (__kmp_num_proc_groups == 1) {
         return 1;
       }
-      for (i = 0; i < __kmp_num_proc_groups; i++) {
+      for (int i = 0; i < __kmp_num_proc_groups; i++) {
         // On windows, the long type is always 32 bits
         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
         unsigned long second_32_bits =
@@ -334,7 +333,7 @@
   void deallocate_mask(KMPAffinity::Mask *m) override {
     KMPNativeAffinity::Mask *native_mask =
         static_cast<KMPNativeAffinity::Mask *>(m);
-    delete m;
+    delete native_mask;
   }
   KMPAffinity::Mask *allocate_mask_array(int num) override {
     return new Mask[num];
diff --git a/runtime/src/kmp_alloc.cpp b/runtime/src/kmp_alloc.cpp
index 8d584f7..8f7f47c 100644
--- a/runtime/src/kmp_alloc.cpp
+++ b/runtime/src/kmp_alloc.cpp
@@ -53,16 +53,6 @@
                   bget_acquire_t acquire, bget_release_t release,
                   bufsize pool_incr);
 
-#ifdef KMP_DEBUG
-static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
-                   bufsize *maxfree, long *nget, long *nrel);
-static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
-                    long *npget, long *nprel, long *ndget, long *ndrel);
-static void bufdump(kmp_info_t *th, void *buf);
-static void bpoold(kmp_info_t *th, void *pool, int dumpalloc, int dumpfree);
-static int bpoolv(kmp_info_t *th, void *pool);
-#endif
-
 /* BGET CONFIGURATION */
 /* Buffer allocation size quantum: all buffers allocated are a
    multiple of this size.  This MUST be a power of two. */
@@ -271,23 +261,6 @@
   return data;
 }
 
-#ifdef KMP_DEBUG
-
-static void __kmp_bget_validate_queue(kmp_info_t *th) {
-  /* NOTE: assume that the global_lock is held */
-
-  void *p = (void *)th->th.th_local.bget_list;
-
-  while (p != 0) {
-    bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
-
-    KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
-    p = (void *)b->ql.flink;
-  }
-}
-
-#endif
-
 /* Walk the free list and release the enqueued buffers */
 static void __kmp_bget_dequeue(kmp_info_t *th) {
   void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
@@ -1017,197 +990,6 @@
     __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
 }
 
-#ifdef KMP_DEBUG
-
-#if BufStats
-
-/*  BSTATS  --  Return buffer allocation free space statistics.  */
-static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
-                   bufsize *maxfree, long *nget, long *nrel) {
-  int bin = 0;
-  thr_data_t *thr = get_thr_data(th);
-
-  *nget = thr->numget;
-  *nrel = thr->numrel;
-  *curalloc = (bufsize)thr->totalloc;
-  *totfree = 0;
-  *maxfree = -1;
-
-  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
-    bfhead_t *b = thr->freelist[bin].ql.flink;
-
-    while (b != &thr->freelist[bin]) {
-      KMP_DEBUG_ASSERT(b->bh.bb.bsize > 0);
-      *totfree += b->bh.bb.bsize;
-      if (b->bh.bb.bsize > *maxfree) {
-        *maxfree = b->bh.bb.bsize;
-      }
-      b = b->ql.flink; /* Link to next buffer */
-    }
-  }
-}
-
-/*  BSTATSE  --  Return extended statistics  */
-static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
-                    long *npget, long *nprel, long *ndget, long *ndrel) {
-  thr_data_t *thr = get_thr_data(th);
-
-  *pool_incr = (thr->pool_len < 0) ? -thr->exp_incr : thr->exp_incr;
-  *npool = thr->numpblk;
-  *npget = thr->numpget;
-  *nprel = thr->numprel;
-  *ndget = thr->numdget;
-  *ndrel = thr->numdrel;
-}
-
-#endif /* BufStats */
-
-/*  BUFDUMP  --  Dump the data in a buffer.  This is called with the  user
-                 data pointer, and backs up to the buffer header.  It will
-                 dump either a free block or an allocated one.  */
-static void bufdump(kmp_info_t *th, void *buf) {
-  bfhead_t *b;
-  unsigned char *bdump;
-  bufsize bdlen;
-
-  b = BFH(((char *)buf) - sizeof(bhead_t));
-  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
-  if (b->bh.bb.bsize < 0) {
-    bdump = (unsigned char *)buf;
-    bdlen = (-b->bh.bb.bsize) - (bufsize)sizeof(bhead_t);
-  } else {
-    bdump = (unsigned char *)(((char *)b) + sizeof(bfhead_t));
-    bdlen = b->bh.bb.bsize - (bufsize)sizeof(bfhead_t);
-  }
-
-  while (bdlen > 0) {
-    int i, dupes = 0;
-    bufsize l = bdlen;
-    char bhex[50], bascii[20];
-
-    if (l > 16) {
-      l = 16;
-    }
-
-    for (i = 0; i < l; i++) {
-      (void)KMP_SNPRINTF(bhex + i * 3, sizeof(bhex) - i * 3, "%02X ", bdump[i]);
-      if (bdump[i] > 0x20 && bdump[i] < 0x7F)
-        bascii[i] = bdump[i];
-      else
-        bascii[i] = ' ';
-    }
-    bascii[i] = 0;
-    (void)__kmp_printf_no_lock("%-48s   %s\n", bhex, bascii);
-    bdump += l;
-    bdlen -= l;
-    while ((bdlen > 16) &&
-           (memcmp((char *)(bdump - 16), (char *)bdump, 16) == 0)) {
-      dupes++;
-      bdump += 16;
-      bdlen -= 16;
-    }
-    if (dupes > 1) {
-      (void)__kmp_printf_no_lock(
-          "     (%d lines [%d bytes] identical to above line skipped)\n", dupes,
-          dupes * 16);
-    } else if (dupes == 1) {
-      bdump -= 16;
-      bdlen += 16;
-    }
-  }
-}
-
-/*  BPOOLD  --  Dump a buffer pool.  The buffer headers are always listed.
-                If DUMPALLOC is nonzero, the contents of allocated buffers
-                are  dumped.   If  DUMPFREE  is  nonzero,  free blocks are
-                dumped as well.  If FreeWipe  checking  is  enabled,  free
-                blocks  which  have  been clobbered will always be dumped. */
-static void bpoold(kmp_info_t *th, void *buf, int dumpalloc, int dumpfree) {
-  bfhead_t *b = BFH((char *)buf - sizeof(bhead_t));
-
-  while (b->bh.bb.bsize != ESent) {
-    bufsize bs = b->bh.bb.bsize;
-
-    if (bs < 0) {
-      bs = -bs;
-      (void)__kmp_printf_no_lock("Allocated buffer: size %6ld bytes.\n",
-                                 (long)bs);
-      if (dumpalloc) {
-        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
-      }
-    } else {
-      const char *lerr = "";
-
-      KMP_DEBUG_ASSERT(bs > 0);
-      if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
-        lerr = "  (Bad free list links)";
-      }
-      (void)__kmp_printf_no_lock("Free block:       size %6ld bytes.%s\n",
-                                 (long)bs, lerr);
-#ifdef FreeWipe
-      lerr = ((char *)b) + sizeof(bfhead_t);
-      if ((bs > sizeof(bfhead_t)) &&
-          ((*lerr != 0x55) ||
-           (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
-            0))) {
-        (void)__kmp_printf_no_lock(
-            "(Contents of above free block have been overstored.)\n");
-        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
-      } else
-#endif
-          if (dumpfree) {
-        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
-      }
-    }
-    b = BFH(((char *)b) + bs);
-  }
-}
-
-/*  BPOOLV  --  Validate a buffer pool. */
-static int bpoolv(kmp_info_t *th, void *buf) {
-  bfhead_t *b = BFH(buf);
-
-  while (b->bh.bb.bsize != ESent) {
-    bufsize bs = b->bh.bb.bsize;
-
-    if (bs < 0) {
-      bs = -bs;
-    } else {
-#ifdef FreeWipe
-      char *lerr = "";
-#endif
-
-      KMP_DEBUG_ASSERT(bs > 0);
-      if (bs <= 0) {
-        return 0;
-      }
-      if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
-        (void)__kmp_printf_no_lock(
-            "Free block: size %6ld bytes.  (Bad free list links)\n", (long)bs);
-        KMP_DEBUG_ASSERT(0);
-        return 0;
-      }
-#ifdef FreeWipe
-      lerr = ((char *)b) + sizeof(bfhead_t);
-      if ((bs > sizeof(bfhead_t)) &&
-          ((*lerr != 0x55) ||
-           (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
-            0))) {
-        (void)__kmp_printf_no_lock(
-            "(Contents of above free block have been overstored.)\n");
-        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
-        KMP_DEBUG_ASSERT(0);
-        return 0;
-      }
-#endif /* FreeWipe */
-    }
-    b = BFH(((char *)b) + bs);
-  }
-  return 1;
-}
-
-#endif /* KMP_DEBUG */
-
 void __kmp_initialize_bget(kmp_info_t *th) {
   KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
 
@@ -1548,33 +1330,6 @@
   return ptr;
 } // func ___kmp_allocate
 
-#if (BUILD_MEMORY == FIRST_TOUCH)
-void *__kmp_ft_page_allocate(size_t size) {
-  void *adr, *aadr;
-
-  const int page_size = KMP_GET_PAGE_SIZE();
-
-  adr = (void *)__kmp_thread_malloc(__kmp_get_thread(),
-                                    size + page_size + KMP_PTR_SKIP);
-  if (adr == 0)
-    KMP_FATAL(OutOfHeapMemory);
-
-  /* check to see if adr is on a page boundary. */
-  if (((kmp_uintptr_t)adr & (page_size - 1)) == 0)
-    /* nothing to do if adr is already on a page boundary. */
-    aadr = adr;
-  else
-    /* else set aadr to the first page boundary in the allocated memory. */
-    aadr = (void *)(((kmp_uintptr_t)adr + page_size) & ~(page_size - 1));
-
-  /* the first touch by the owner thread. */
-  *((void **)aadr) = adr;
-
-  /* skip the memory space used for storing adr above. */
-  return (void *)((char *)aadr + KMP_PTR_SKIP);
-}
-#endif
-
 /* Allocate memory on page boundary, fill allocated memory with 0x00.
    Does not call this func directly! Use __kmp_page_allocate macro instead.
    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
diff --git a/runtime/src/kmp_atomic.cpp b/runtime/src/kmp_atomic.cpp
index b099eb6..437f158 100644
--- a/runtime/src/kmp_atomic.cpp
+++ b/runtime/src/kmp_atomic.cpp
@@ -2906,7 +2906,6 @@
 #define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)      \
   ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
   TYPE new_value;                                                              \
-  TYPE KMP_ATOMIC_VOLATILE temp_val;                                           \
   OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG)                                      \
   OP_CMPXCHG_CPT_REV(TYPE, BITS, OP)                                           \
   }
@@ -2978,7 +2977,6 @@
 #define ATOMIC_CRITICAL_CPT_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)   \
   ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
   TYPE new_value;                                                              \
-  TYPE KMP_ATOMIC_VOLATILE temp_val;                                           \
   /*printf("__kmp_atomic_mode = %d\n", __kmp_atomic_mode);*/                   \
   OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG)                                      \
   OP_CRITICAL_CPT_REV(OP, LCK_ID)                                              \
diff --git a/runtime/src/kmp_barrier.cpp b/runtime/src/kmp_barrier.cpp
index ca107cd..e817549 100644
--- a/runtime/src/kmp_barrier.cpp
+++ b/runtime/src/kmp_barrier.cpp
@@ -956,14 +956,12 @@
   // All subordinates are gathered; now release parent if not master thread
 
   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
-    KA_TRACE(
-        20,
-        ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
-         "arrived(%p): %llu => %llu\n",
-         gtid, team->t.t_id, tid,
-         __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
-         thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
-         thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
+                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
+                  gtid, team->t.t_id, tid,
+                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
+                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
+                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
     /* Mark arrival to parent: After performing this write, a worker thread may
        not assume that the team is valid any more - it could be deallocated by
        the master thread at any time. */
@@ -973,8 +971,8 @@
       ANNOTATE_BARRIER_BEGIN(this_thr);
       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
       flag.release();
-    } else { // Leaf does special release on the "offset" bits of parent's
-      // b_arrived flag
+    } else {
+      // Leaf does special release on "offset" bits of parent's b_arrived flag
       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
       flag.set_waiter(other_threads[thr_bar->parent_tid]);
@@ -1225,7 +1223,6 @@
   kmp_info_t *this_thr = __kmp_threads[gtid];
   kmp_team_t *team = this_thr->th.th_team;
   int status = 0;
-  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
 #if OMPT_SUPPORT
   ompt_data_t *my_task_data;
   ompt_data_t *my_parallel_data;
@@ -1353,10 +1350,10 @@
 #endif
 
 #if OMP_40_ENABLED
+      kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
       // Reset cancellation flag for worksharing constructs
-      if (team->t.t_cancel_request == cancel_loop ||
-          team->t.t_cancel_request == cancel_sections) {
-        team->t.t_cancel_request = cancel_noreq;
+      if (cancel_request == cancel_loop || cancel_request == cancel_sections) {
+        KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
       }
 #endif
 #if USE_ITT_BUILD
@@ -1375,6 +1372,7 @@
           this_thr->th.th_teams_microtask == NULL &&
 #endif
           team->t.t_active_level == 1) {
+        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
         kmp_uint64 cur_time = __itt_get_timestamp();
         kmp_info_t **other_threads = team->t.t_threads;
         int nproc = this_thr->th.th_team_nproc;
diff --git a/runtime/src/kmp_cancel.cpp b/runtime/src/kmp_cancel.cpp
index 71c71ec..3895f97 100644
--- a/runtime/src/kmp_cancel.cpp
+++ b/runtime/src/kmp_cancel.cpp
@@ -51,8 +51,8 @@
       {
         kmp_team_t *this_team = this_thr->th.th_team;
         KMP_DEBUG_ASSERT(this_team);
-        kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
-            &(this_team->t.t_cancel_request), cancel_noreq, cncl_kind);
+        kmp_int32 old = cancel_noreq;
+        this_team->t.t_cancel_request.compare_exchange_strong(old, cncl_kind);
         if (old == cancel_noreq || old == cncl_kind) {
 // we do not have a cancellation request in this team or we do have
 // one that matches the current request -> cancel
@@ -89,8 +89,8 @@
 
         taskgroup = task->td_taskgroup;
         if (taskgroup) {
-          kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
-              &(taskgroup->cancel_request), cancel_noreq, cncl_kind);
+          kmp_int32 old = cancel_noreq;
+          taskgroup->cancel_request.compare_exchange_strong(old, cncl_kind);
           if (old == cancel_noreq || old == cncl_kind) {
 // we do not have a cancellation request in this taskgroup or we do
 // have one that matches the current request -> cancel
@@ -257,7 +257,7 @@
   if (__kmp_omp_cancellation) {
     // depending on which construct to cancel, check the flag and
     // reset the flag
-    switch (this_team->t.t_cancel_request) {
+    switch (KMP_ATOMIC_LD_RLX(&(this_team->t.t_cancel_request))) {
     case cancel_parallel:
       ret = 1;
       // ensure that threads have checked the flag, when
diff --git a/runtime/src/kmp_config.h.cmake b/runtime/src/kmp_config.h.cmake
index 571658a..04fc52b 100644
--- a/runtime/src/kmp_config.h.cmake
+++ b/runtime/src/kmp_config.h.cmake
@@ -54,6 +54,8 @@
 #define KMP_USE_INTERNODE_ALIGNMENT LIBOMP_USE_INTERNODE_ALIGNMENT
 #cmakedefine01 LIBOMP_ENABLE_ASSERTIONS
 #define KMP_USE_ASSERT LIBOMP_ENABLE_ASSERTIONS
+#cmakedefine01 LIBOMP_USE_HIER_SCHED
+#define KMP_USE_HIER_SCHED LIBOMP_USE_HIER_SCHED
 #cmakedefine01 STUBS_LIBRARY
 #cmakedefine01 LIBOMP_USE_HWLOC
 #define KMP_USE_HWLOC LIBOMP_USE_HWLOC
diff --git a/runtime/src/kmp_csupport.cpp b/runtime/src/kmp_csupport.cpp
index 7892e78..9729f39 100644
--- a/runtime/src/kmp_csupport.cpp
+++ b/runtime/src/kmp_csupport.cpp
@@ -262,6 +262,14 @@
   int gtid = __kmp_entry_gtid();
 
 #if (KMP_STATS_ENABLED)
+  // If we were in a serial region, then stop the serial timer, record
+  // the event, and start parallel region timer
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
+  } else {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
+  }
   int inParallel = __kmpc_in_parallel(loc);
   if (inParallel) {
     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
@@ -318,6 +326,14 @@
 
     va_end(ap);
   }
+
+#if KMP_STATS_ENABLED
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+  } else {
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif // KMP_STATS_ENABLED
 }
 
 #if OMP_40_ENABLED
@@ -930,9 +946,10 @@
 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
   {                                                                            \
     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
-    if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                    \
-        !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
-                                     KMP_LOCK_BUSY(gtid + 1, tas))) {          \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
+    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
+        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
       kmp_uint32 spins;                                                        \
       KMP_FSYNC_PREPARE(l);                                                    \
       KMP_INIT_YIELD(spins);                                                   \
@@ -943,9 +960,9 @@
         KMP_YIELD_SPIN(spins);                                                 \
       }                                                                        \
       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
-      while (l->lk.poll != KMP_LOCK_FREE(tas) ||                               \
-             !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),   \
-                                          KMP_LOCK_BUSY(gtid + 1, tas))) {     \
+      while (                                                                  \
+          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
+          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
         __kmp_spin_backoff(&backoff);                                          \
         if (TCR_4(__kmp_nth) >                                                 \
             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
@@ -962,17 +979,15 @@
 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
   {                                                                            \
     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
-    rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                   \
-         KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
-                                     KMP_LOCK_BUSY(gtid + 1, tas));            \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
+    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
+         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
   }
 
 // Fast-path release tas lock
 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
-  {                                                                            \
-    TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));              \
-    KMP_MB();                                                                  \
-  }
+  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
 
 #if KMP_USE_FUTEX
 
@@ -1116,8 +1131,6 @@
   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
 #else
   KMP_COUNT_BLOCK(OMP_CRITICAL);
-  KMP_TIME_PARTITIONED_BLOCK(
-      OMP_critical_wait); /* Time spent waiting to enter the critical section */
 #if OMPT_SUPPORT && OMPT_OPTIONAL
   omp_state_t prev_state = omp_state_undefined;
   ompt_thread_info_t ti;
@@ -1128,6 +1141,7 @@
 
   // TODO: add THR_OVHD_STATE
 
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
   KMP_CHECK_USER_LOCK_INIT();
 
   if ((__kmp_user_lock_kind == lk_tas) &&
@@ -1194,8 +1208,9 @@
     }
   }
 #endif
+  KMP_POP_PARTITIONED_TIMER();
 
-  KMP_START_EXPLICIT_TIMER(OMP_critical);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
 #endif // KMP_USE_DYNAMIC_LOCK
 }
@@ -1247,6 +1262,7 @@
 }
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
+#if KMP_USE_DYNAMIC_LOCK
 static kmp_mutex_impl_t
 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
   if (user_lock) {
@@ -1291,7 +1307,7 @@
     return ompt_mutex_impl_unknown;
   }
 }
-
+#else
 // For locks without dynamic binding
 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
   switch (__kmp_user_lock_kind) {
@@ -1314,7 +1330,8 @@
     return ompt_mutex_impl_unknown;
   }
 }
-#endif
+#endif // KMP_USE_DYNAMIC_LOCK
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
 
 /*!
 @ingroup WORK_SHARING
@@ -1346,6 +1363,7 @@
 
   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
   // Check if it is initialized.
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
   if (*lk == 0) {
     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
     if (KMP_IS_D_LOCK(lckseq)) {
@@ -1423,6 +1441,7 @@
 #endif
     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
   }
+  KMP_POP_PARTITIONED_TIMER();
 
 #if USE_ITT_BUILD
   __kmp_itt_critical_acquired(lck);
@@ -1754,6 +1773,7 @@
 Mark the end of a statically scheduled loop.
 */
 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
+  KMP_POP_PARTITIONED_TIMER();
   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -1780,7 +1800,6 @@
         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
-
   if (__kmp_env_consistency_check)
     __kmp_pop_workshare(global_tid, ct_pdo, loc);
 }
@@ -3765,7 +3784,7 @@
 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
 */
 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
-                          struct kmp_dim *dims) {
+                          const struct kmp_dim *dims) {
   int j, idx;
   kmp_int64 last, trace_count;
   kmp_info_t *th = __kmp_threads[gtid];
@@ -3884,7 +3903,7 @@
   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
 }
 
-void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
+void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
   kmp_int32 shft, num_dims, i;
   kmp_uint32 flag;
   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
@@ -3977,7 +3996,7 @@
             gtid, (iter_number << 5) + shft));
 }
 
-void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
+void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
   kmp_int32 shft, num_dims, i;
   kmp_uint32 flag;
   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
@@ -4059,6 +4078,7 @@
         __kmp_dispatch_num_buffers; // free buffer for future re-use
   }
   // free private resources (need to keep buffer index forever)
+  pr_buf->th_doacross_flags = NULL;
   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
   pr_buf->th_doacross_info = NULL;
   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
diff --git a/runtime/src/kmp_debug.h b/runtime/src/kmp_debug.h
index 1382dfe..29cc15c 100644
--- a/runtime/src/kmp_debug.h
+++ b/runtime/src/kmp_debug.h
@@ -35,24 +35,33 @@
 extern int __kmp_debug_assert(char const *expr, char const *file, int line);
 #ifdef KMP_DEBUG
 #define KMP_ASSERT(cond)                                                       \
-  ((cond) ? 0 : __kmp_debug_assert(#cond, __FILE__, __LINE__))
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert(#cond, __FILE__, __LINE__);                             \
+  }
 #define KMP_ASSERT2(cond, msg)                                                 \
-  ((cond) ? 0 : __kmp_debug_assert((msg), __FILE__, __LINE__))
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert((msg), __FILE__, __LINE__);                             \
+  }
 #define KMP_DEBUG_ASSERT(cond) KMP_ASSERT(cond)
 #define KMP_DEBUG_ASSERT2(cond, msg) KMP_ASSERT2(cond, msg)
+#define KMP_DEBUG_USE_VAR(x) /* Nothing (it is used!) */
 #else
 // Do not expose condition in release build. Use "assertion failure".
 #define KMP_ASSERT(cond)                                                       \
-  ((cond) ? 0 : __kmp_debug_assert("assertion failure", __FILE__, __LINE__))
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert("assertion failure", __FILE__, __LINE__);               \
+  }
 #define KMP_ASSERT2(cond, msg) KMP_ASSERT(cond)
-#define KMP_DEBUG_ASSERT(cond) 0
-#define KMP_DEBUG_ASSERT2(cond, msg) 0
+#define KMP_DEBUG_ASSERT(cond) /* Nothing */
+#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_USE_VAR(x) ((void)(x))
 #endif // KMP_DEBUG
 #else
-#define KMP_ASSERT(cond) 0
-#define KMP_ASSERT2(cond, msg) 0
-#define KMP_DEBUG_ASSERT(cond) 0
-#define KMP_DEBUG_ASSERT2(cond, msg) 0
+#define KMP_ASSERT(cond) /* Nothing */
+#define KMP_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_ASSERT(cond) /* Nothing */
+#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_USE_VAR(x) ((void)(x))
 #endif // KMP_USE_ASSERT
 
 #ifdef KMP_DEBUG
diff --git a/runtime/src/kmp_debugger.cpp b/runtime/src/kmp_debugger.cpp
index 3484f7a..c02d251 100644
--- a/runtime/src/kmp_debugger.cpp
+++ b/runtime/src/kmp_debugger.cpp
@@ -68,7 +68,9 @@
     addr_and_size_of(__kmp_threads),
     addr_and_size_of(__kmp_root),
     addr_and_size_of(__kmp_threads_capacity),
+#if KMP_USE_MONITOR
     addr_and_size_of(__kmp_monitor),
+#endif
 #if !KMP_USE_DYNAMIC_LOCK
     addr_and_size_of(__kmp_user_lock_table),
 #endif
diff --git a/runtime/src/kmp_dispatch.cpp b/runtime/src/kmp_dispatch.cpp
index a821ec1..8e471ab 100644
--- a/runtime/src/kmp_dispatch.cpp
+++ b/runtime/src/kmp_dispatch.cpp
@@ -34,275 +34,20 @@
 #if KMP_OS_WINDOWS && KMP_ARCH_X86
 #include <float.h>
 #endif
+#include "kmp_lock.h"
+#include "kmp_dispatch.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
 
 #if OMPT_SUPPORT
 #include "ompt-specific.h"
 #endif
 
 /* ------------------------------------------------------------------------ */
-
-#if KMP_STATIC_STEAL_ENABLED
-
-// replaces dispatch_private_info{32,64} structures and
-// dispatch_private_info{32,64}_t types
-template <typename T> struct dispatch_private_infoXX_template {
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-  UT count; // unsigned
-  T ub;
-  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
-  T lb;
-  ST st; // signed
-  UT tc; // unsigned
-  T static_steal_counter; // for static_steal only; maybe better to put after ub
-
-  /* parm[1-4] are used in different ways by different scheduling algorithms */
-
-  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
-  //    a) parm3 is properly aligned and
-  //    b) all parm1-4 are in the same cache line.
-  // Because of parm1-4 are used together, performance seems to be better
-  // if they are in the same line (not measured though).
-
-  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
-    T parm1;
-    T parm2;
-    T parm3;
-    T parm4;
-  };
-
-  UT ordered_lower; // unsigned
-  UT ordered_upper; // unsigned
-#if KMP_OS_WINDOWS
-  T last_upper;
-#endif /* KMP_OS_WINDOWS */
-};
-
-#else /* KMP_STATIC_STEAL_ENABLED */
-
-// replaces dispatch_private_info{32,64} structures and
-// dispatch_private_info{32,64}_t types
-template <typename T> struct dispatch_private_infoXX_template {
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-  T lb;
-  T ub;
-  ST st; // signed
-  UT tc; // unsigned
-
-  T parm1;
-  T parm2;
-  T parm3;
-  T parm4;
-
-  UT count; // unsigned
-
-  UT ordered_lower; // unsigned
-  UT ordered_upper; // unsigned
-#if KMP_OS_WINDOWS
-  T last_upper;
-#endif /* KMP_OS_WINDOWS */
-};
-
-#endif /* KMP_STATIC_STEAL_ENABLED */
-
-// replaces dispatch_private_info structure and dispatch_private_info_t type
-template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
-  // duplicate alignment here, otherwise size of structure is not correct in our
-  // compiler
-  union KMP_ALIGN_CACHE private_info_tmpl {
-    dispatch_private_infoXX_template<T> p;
-    dispatch_private_info64_t p64;
-  } u;
-  enum sched_type schedule; /* scheduling algorithm */
-  kmp_uint32 ordered; /* ordered clause specified */
-  kmp_uint32 ordered_bumped;
-  // To retain the structure size after making ordered_iteration scalar
-  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
-  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
-  kmp_uint32 nomerge; /* don't merge iters if serialized */
-  kmp_uint32 type_size;
-  enum cons_type pushed_ws;
-};
-
-// replaces dispatch_shared_info{32,64} structures and
-// dispatch_shared_info{32,64}_t types
-template <typename UT> struct dispatch_shared_infoXX_template {
-  /* chunk index under dynamic, number of idle threads under static-steal;
-     iteration index otherwise */
-  volatile UT iteration;
-  volatile UT num_done;
-  volatile UT ordered_iteration;
-  // to retain the structure size making ordered_iteration scalar
-  UT ordered_dummy[KMP_MAX_ORDERED - 3];
-};
-
-// replaces dispatch_shared_info structure and dispatch_shared_info_t type
-template <typename UT> struct dispatch_shared_info_template {
-  // we need union here to keep the structure size
-  union shared_info_tmpl {
-    dispatch_shared_infoXX_template<UT> s;
-    dispatch_shared_info64_t s64;
-  } u;
-  volatile kmp_uint32 buffer_index;
-#if OMP_45_ENABLED
-  volatile kmp_int32 doacross_buf_idx; // teamwise index
-  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
-  kmp_int32 doacross_num_done; // count finished threads
-#endif
-#if KMP_USE_HWLOC
-  // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
-  // machines (> 48 cores). Performance analysis showed that a cache thrash
-  // was occurring and this padding helps alleviate the problem.
-  char padding[64];
-#endif
-};
-
 /* ------------------------------------------------------------------------ */
 
-#undef USE_TEST_LOCKS
-
-// test_then_add template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
-
-template <>
-__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
-                                                 kmp_int32 d) {
-  kmp_int32 r;
-  r = KMP_TEST_THEN_ADD32(p, d);
-  return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
-                                                 kmp_int64 d) {
-  kmp_int64 r;
-  r = KMP_TEST_THEN_ADD64(p, d);
-  return r;
-}
-
-// test_then_inc_acq template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
-
-template <>
-__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
-  kmp_int32 r;
-  r = KMP_TEST_THEN_INC_ACQ32(p);
-  return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
-  kmp_int64 r;
-  r = KMP_TEST_THEN_INC_ACQ64(p);
-  return r;
-}
-
-// test_then_inc template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_inc(volatile T *p);
-
-template <>
-__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
-  kmp_int32 r;
-  r = KMP_TEST_THEN_INC32(p);
-  return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
-  kmp_int64 r;
-  r = KMP_TEST_THEN_INC64(p);
-  return r;
-}
-
-// compare_and_swap template (general template should NOT be used)
-template <typename T>
-static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
-
-template <>
-__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
-                                                    kmp_int32 c, kmp_int32 s) {
-  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
-}
-
-template <>
-__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
-                                                    kmp_int64 c, kmp_int64 s) {
-  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
-}
-
-/* Spin wait loop that first does pause, then yield.
-    Waits until function returns non-zero when called with *spinner and check.
-    Does NOT put threads to sleep.
-    Arguments:
-        UT is unsigned 4- or 8-byte type
-        spinner - memory location to check value
-        checker - value which spinner is >, <, ==, etc.
-        pred - predicate function to perform binary comparison of some sort
-#if USE_ITT_BUILD
-        obj -- is higher-level synchronization object to report to ittnotify.
-        It is used to report locks consistently. For example, if lock is
-        acquired immediately, its address is reported to ittnotify via
-        KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
-        and lock routine calls to KMP_WAIT_YIELD(), the later should report the
-        same address, not an address of low-level spinner.
-#endif // USE_ITT_BUILD
-    TODO: make inline function (move to header file for icl)
-*/
-template <typename UT>
-static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
-                           kmp_uint32 (*pred)(UT, UT)
-                               USE_ITT_BUILD_ARG(void *obj)) {
-  // note: we may not belong to a team at this point
-  volatile UT *spin = spinner;
-  UT check = checker;
-  kmp_uint32 spins;
-  kmp_uint32 (*f)(UT, UT) = pred;
-  UT r;
-
-  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
-  KMP_INIT_YIELD(spins);
-  // main wait spin loop
-  while (!f(r = *spin, check)) {
-    KMP_FSYNC_SPIN_PREPARE(obj);
-    /* GEH - remove this since it was accidentally introduced when kmp_wait was
-       split. It causes problems with infinite recursion because of exit lock */
-    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
-        __kmp_abort_thread(); */
-
-    // if we are oversubscribed, or have waited a bit (and
-    // KMP_LIBRARY=throughput, then yield. pause is in the following code
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
-  }
-  KMP_FSYNC_SPIN_ACQUIRED(obj);
-  return r;
-}
-
-template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
-  return value == checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
-  return value != checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
-  return value < checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
-  return value >= checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
-  return value <= checker;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
-                                     ident_t *loc_ref) {
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
   kmp_info_t *th;
 
   KMP_DEBUG_ASSERT(gtid_ref);
@@ -320,85 +65,7 @@
   }
 }
 
-template <typename UT>
-static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
-  typedef typename traits_t<UT>::signed_t ST;
-  dispatch_private_info_template<UT> *pr;
-
-  int gtid = *gtid_ref;
-  //    int  cid = *cid_ref;
-  kmp_info_t *th = __kmp_threads[gtid];
-  KMP_DEBUG_ASSERT(th->th.th_dispatch);
-
-  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
-  if (__kmp_env_consistency_check) {
-    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
-        th->th.th_dispatch->th_dispatch_pr_current);
-    if (pr->pushed_ws != ct_none) {
-#if KMP_USE_DYNAMIC_LOCK
-      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
-#else
-      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
-#endif
-    }
-  }
-
-  if (!th->th.th_team->t.t_serialized) {
-    dispatch_shared_info_template<UT> *sh =
-        reinterpret_cast<dispatch_shared_info_template<UT> *>(
-            th->th.th_dispatch->th_dispatch_sh_current);
-    UT lower;
-
-    if (!__kmp_env_consistency_check) {
-      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
-          th->th.th_dispatch->th_dispatch_pr_current);
-    }
-    lower = pr->u.p.ordered_lower;
-
-#if !defined(KMP_GOMP_COMPAT)
-    if (__kmp_env_consistency_check) {
-      if (pr->ordered_bumped) {
-        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
-        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
-                               ct_ordered_in_pdo, loc_ref,
-                               &p->stack_data[p->w_top]);
-      }
-    }
-#endif /* !defined(KMP_GOMP_COMPAT) */
-
-    KMP_MB();
-#ifdef KMP_DEBUG
-    {
-      char *buff;
-      // create format specifiers before the debug output
-      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
-                              "ordered_iter:%%%s lower:%%%s\n",
-                              traits_t<UT>::spec, traits_t<UT>::spec);
-      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
-      __kmp_str_free(&buff);
-    }
-#endif
-
-    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
-    KMP_MB(); /* is this necessary? */
-#ifdef KMP_DEBUG
-    {
-      char *buff;
-      // create format specifiers before the debug output
-      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
-                              "ordered_iter:%%%s lower:%%%s\n",
-                              traits_t<UT>::spec, traits_t<UT>::spec);
-      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
-      __kmp_str_free(&buff);
-    }
-#endif
-  }
-  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
-}
-
-static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
-                                     ident_t *loc_ref) {
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
   kmp_info_t *th;
 
   if (__kmp_env_consistency_check) {
@@ -409,153 +76,46 @@
   }
 }
 
-template <typename UT>
-static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
-  typedef typename traits_t<UT>::signed_t ST;
-  dispatch_private_info_template<UT> *pr;
-
-  int gtid = *gtid_ref;
-  //    int  cid = *cid_ref;
-  kmp_info_t *th = __kmp_threads[gtid];
-  KMP_DEBUG_ASSERT(th->th.th_dispatch);
-
-  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
-  if (__kmp_env_consistency_check) {
-    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
-        th->th.th_dispatch->th_dispatch_pr_current);
-    if (pr->pushed_ws != ct_none) {
-      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
-    }
-  }
-
-  if (!th->th.th_team->t.t_serialized) {
-    dispatch_shared_info_template<UT> *sh =
-        reinterpret_cast<dispatch_shared_info_template<UT> *>(
-            th->th.th_dispatch->th_dispatch_sh_current);
-
-    if (!__kmp_env_consistency_check) {
-      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
-          th->th.th_dispatch->th_dispatch_pr_current);
-    }
-
-    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
-#if !defined(KMP_GOMP_COMPAT)
-    if (__kmp_env_consistency_check) {
-      if (pr->ordered_bumped != 0) {
-        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
-        /* How to test it? - OM */
-        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
-                               ct_ordered_in_pdo, loc_ref,
-                               &p->stack_data[p->w_top]);
-      }
-    }
-#endif /* !defined(KMP_GOMP_COMPAT) */
-
-    KMP_MB(); /* Flush all pending memory write invalidates.  */
-
-    pr->ordered_bumped += 1;
-
-    KD_TRACE(1000,
-             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
-              gtid, pr->ordered_bumped));
-
-    KMP_MB(); /* Flush all pending memory write invalidates.  */
-
-    /* TODO use general release procedure? */
-    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
-
-    KMP_MB(); /* Flush all pending memory write invalidates.  */
-  }
-  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
-}
-
-// Computes and returns x to the power of y, where y must a non-negative integer
-template <typename UT>
-static __forceinline long double __kmp_pow(long double x, UT y) {
-  long double s = 1.0L;
-
-  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
-  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
-  while (y) {
-    if (y & 1)
-      s *= x;
-    x *= x;
-    y >>= 1;
-  }
-  return s;
-}
-
-/* Computes and returns the number of unassigned iterations after idx chunks
-   have been assigned (the total number of unassigned iterations in chunks with
-   index greater than or equal to idx). __forceinline seems to be broken so that
-   if we __forceinline this function, the behavior is wrong
-   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
+// Initialize a dispatch_private_info_template<T> buffer for a particular
+// type of schedule,chunk.  The loop description is found in lb (lower bound),
+// ub (upper bound), and st (stride).  nproc is the number of threads relevant
+// to the scheduling (often the number of threads in a team, but not always if
+// hierarchical scheduling is used).  tid is the id of the thread calling
+// the function within the group of nproc threads.  It will have a value
+// between 0 and nproc - 1.  This is often just the thread id within a team, but
+// is not necessarily the case when using hierarchical scheduling.
+// loc is the source file location of the corresponding loop
+// gtid is the global thread id
 template <typename T>
-static __inline typename traits_t<T>::unsigned_t
-__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
-                                typename traits_t<T>::unsigned_t idx) {
-  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
-     ICL 8.1, long double arithmetic may not really have long double precision,
-     even with /Qlong_double.  Currently, we workaround that in the caller code,
-     by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
-     of precision is not expected to be a correctness issue, though. */
+void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+                                   dispatch_private_info_template<T> *pr,
+                                   enum sched_type schedule, T lb, T ub,
+                                   typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+                                   kmp_uint64 *cur_chunk,
+#endif
+                                   typename traits_t<T>::signed_t chunk,
+                                   T nproc, T tid) {
   typedef typename traits_t<T>::unsigned_t UT;
-
-  long double x = tc * __kmp_pow<UT>(base, idx);
-  UT r = (UT)x;
-  if (x == r)
-    return r;
-  return r + 1;
-}
-
-// Parameters of the guided-iterative algorithm:
-//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
-//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
-// by default n = 2. For example with n = 3 the chunks distribution will be more
-// flat.
-// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
-static int guided_int_param = 2;
-static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
-
-// UT - unsigned flavor of T, ST - signed flavor of T,
-// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
-template <typename T>
-static void
-__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
-                    T ub, typename traits_t<T>::signed_t st,
-                    typename traits_t<T>::signed_t chunk, int push_ws) {
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
   typedef typename traits_t<T>::floating_t DBL;
 
   int active;
   T tc;
   kmp_info_t *th;
   kmp_team_t *team;
-  kmp_uint32 my_buffer_index;
-  dispatch_private_info_template<T> *pr;
-  dispatch_shared_info_template<UT> volatile *sh;
 
-  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
-                   sizeof(dispatch_private_info));
-  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
-                   sizeof(dispatch_shared_info));
-
-  if (!TCR_4(__kmp_init_parallel))
-    __kmp_parallel_initialize();
-
-#if INCLUDE_SSC_MARKS
-  SSC_MARK_DISPATCH_INIT();
-#endif
 #ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
   {
     char *buff;
     // create format specifiers before the debug output
-    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
-                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
-                            traits_t<ST>::spec, traits_t<T>::spec,
-                            traits_t<T>::spec, traits_t<ST>::spec);
-    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
+    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
+                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
+                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
+                            traits_t<T>::spec, traits_t<T>::spec,
+                            traits_t<ST>::spec, traits_t<ST>::spec,
+                            traits_t<T>::spec, traits_t<T>::spec);
+    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
     __kmp_str_free(&buff);
   }
 #endif
@@ -563,10 +123,8 @@
   th = __kmp_threads[gtid];
   team = th->th.th_team;
   active = !team->t.t_serialized;
-  th->th.th_ident = loc;
 
 #if USE_ITT_BUILD
-  kmp_uint64 cur_chunk = chunk;
   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                     __kmp_forkjoin_frames_mode == 3 &&
                                     KMP_MASTER_GTID(gtid) &&
@@ -575,23 +133,6 @@
 #endif
                                     team->t.t_active_level == 1;
 #endif
-  if (!active) {
-    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
-        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
-  } else {
-    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
-                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
-
-    my_buffer_index = th->th.th_dispatch->th_disp_index++;
-
-    /* What happens when number of threads changes, need to resize buffer? */
-    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
-        &th->th.th_dispatch
-             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
-    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
-        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
-  }
-
 #if (KMP_STATIC_STEAL_ENABLED)
   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
     // AC: we now have only one implementation of stealing, so use it
@@ -602,19 +143,19 @@
 
   /* Pick up the nomerge/ordered bits from the scheduling type */
   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
-    pr->nomerge = TRUE;
+    pr->flags.nomerge = TRUE;
     schedule =
         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
   } else {
-    pr->nomerge = FALSE;
+    pr->flags.nomerge = FALSE;
   }
   pr->type_size = traits_t<T>::type_size; // remember the size of variables
   if (kmp_ord_lower & schedule) {
-    pr->ordered = TRUE;
+    pr->flags.ordered = TRUE;
     schedule =
         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
   } else {
-    pr->ordered = FALSE;
+    pr->flags.ordered = FALSE;
   }
 
   if (schedule == kmp_sch_static) {
@@ -635,15 +176,16 @@
       // specified)
       chunk = team->t.t_sched.chunk;
 #if USE_ITT_BUILD
-      cur_chunk = chunk;
+      if (cur_chunk)
+        *cur_chunk = chunk;
 #endif
 #ifdef KMP_DEBUG
       {
         char *buff;
         // create format specifiers before the debug output
-        buff = __kmp_str_format(
-            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
-            traits_t<ST>::spec);
+        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
+                                "schedule:%%d chunk:%%%s\n",
+                                traits_t<ST>::spec);
         KD_TRACE(10, (buff, gtid, schedule, chunk));
         __kmp_str_free(&buff);
       }
@@ -664,9 +206,10 @@
       {
         char *buff;
         // create format specifiers before the debug output
-        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
-                                "schedule:%%d chunk:%%%s\n",
-                                traits_t<ST>::spec);
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
+            "schedule:%%d chunk:%%%s\n",
+            traits_t<ST>::spec);
         KD_TRACE(10, (buff, gtid, schedule, chunk));
         __kmp_str_free(&buff);
       }
@@ -674,11 +217,11 @@
     }
 
     /* guided analytical not safe for too many threads */
-    if (schedule == kmp_sch_guided_analytical_chunked &&
-        th->th.th_team_nproc > 1 << 20) {
+    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
       schedule = kmp_sch_guided_iterative_chunked;
       KMP_WARNING(DispatchManyThreads);
     }
+#if OMP_45_ENABLED
     if (schedule == kmp_sch_runtime_simd) {
       // compiler provides simd_width in the chunk parameter
       schedule = team->t.t_sched.r_sched_type;
@@ -694,7 +237,8 @@
         chunk = team->t.t_sched.chunk * chunk;
       }
 #if USE_ITT_BUILD
-      cur_chunk = chunk;
+      if (cur_chunk)
+        *cur_chunk = chunk;
 #endif
 #ifdef KMP_DEBUG
       {
@@ -708,6 +252,7 @@
       }
 #endif
     }
+#endif // OMP_45_ENABLED
     pr->u.p.parm1 = chunk;
   }
   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
@@ -718,7 +263,7 @@
   if (__kmp_env_consistency_check) {
     if (st == 0) {
       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
-                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
+                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
     }
   }
   // compute trip count
@@ -746,16 +291,6 @@
     }
   }
 
-  // Any half-decent optimizer will remove this test when the blocks are empty
-  // since the macros expand to nothing when statistics are disabled.
-  if (schedule == __kmp_static) {
-    KMP_COUNT_BLOCK(OMP_FOR_static);
-    KMP_COUNT_VALUE(FOR_static_iterations, tc);
-  } else {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
-    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
-  }
-
   pr->u.p.lb = lb;
   pr->u.p.ub = ub;
   pr->u.p.st = st;
@@ -768,44 +303,26 @@
   /* NOTE: only the active parallel region(s) has active ordered sections */
 
   if (active) {
-    if (pr->ordered == 0) {
-      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
-      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
-    } else {
+    if (pr->flags.ordered) {
       pr->ordered_bumped = 0;
-
       pr->u.p.ordered_lower = 1;
       pr->u.p.ordered_upper = 0;
-
-      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
-      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
-    }
-  }
-
-  if (__kmp_env_consistency_check) {
-    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
-    if (push_ws) {
-      __kmp_push_workshare(gtid, ws, loc);
-      pr->pushed_ws = ws;
-    } else {
-      __kmp_check_workshare(gtid, ws, loc);
-      pr->pushed_ws = ct_none;
     }
   }
 
   switch (schedule) {
 #if (KMP_STATIC_STEAL_ENABLED)
   case kmp_sch_static_steal: {
-    T nproc = th->th.th_team_nproc;
     T ntc, init;
 
     KD_TRACE(100,
-             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
+             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
+              gtid));
 
     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
     if (nproc > 1 && ntc >= nproc) {
-      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
-      T id = __kmp_tid_from_gtid(gtid);
+      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
+      T id = tid;
       T small_chunk, extras;
 
       small_chunk = ntc / nproc;
@@ -832,7 +349,7 @@
       }
       break;
     } else {
-      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                      "kmp_sch_static_balanced\n",
                      gtid));
       schedule = kmp_sch_static_balanced;
@@ -842,14 +359,15 @@
   } // case
 #endif
   case kmp_sch_static_balanced: {
-    T nproc = th->th.th_team_nproc;
     T init, limit;
 
-    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
-                   gtid));
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
+         gtid));
 
     if (nproc > 1) {
-      T id = __kmp_tid_from_gtid(gtid);
+      T id = tid;
 
       if (tc < nproc) {
         if (id < tc) {
@@ -873,7 +391,8 @@
         init = 0;
         limit = tc - 1;
         pr->u.p.parm1 = TRUE;
-      } else { // zero trip count
+      } else {
+        // zero trip count
         pr->u.p.count = 1; /* means no more chunks to execute */
         pr->u.p.parm1 = FALSE;
         break;
@@ -882,7 +401,8 @@
 #if USE_ITT_BUILD
     // Calculate chunk for metadata report
     if (itt_need_metadata_reporting)
-      cur_chunk = limit - init + 1;
+      if (cur_chunk)
+        *cur_chunk = limit - init + 1;
 #endif
     if (st == 1) {
       pr->u.p.lb = lb + init;
@@ -899,16 +419,17 @@
         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
       }
     }
-    if (pr->ordered) {
+    if (pr->flags.ordered) {
       pr->u.p.ordered_lower = init;
       pr->u.p.ordered_upper = limit;
     }
     break;
   } // case
+#if OMP_45_ENABLED
   case kmp_sch_static_balanced_chunked: {
     // similar to balanced, but chunk adjusted to multiple of simd width
-    T nth = th->th.th_team_nproc;
-    KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
+    T nth = nproc;
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                    " -> falling-through to static_greedy\n",
                    gtid));
     schedule = kmp_sch_static_greedy;
@@ -918,12 +439,14 @@
       pr->u.p.parm1 = tc;
     break;
   } // case
-  case kmp_sch_guided_iterative_chunked:
-  case kmp_sch_guided_simd: {
-    T nproc = th->th.th_team_nproc;
-    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
-                   " case\n",
-                   gtid));
+  case kmp_sch_guided_simd:
+#endif // OMP_45_ENABLED
+  case kmp_sch_guided_iterative_chunked: {
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
+         " case\n",
+         gtid));
 
     if (nproc > 1) {
       if ((2L * chunk + 1) * nproc >= tc) {
@@ -936,22 +459,24 @@
             guided_flt_param / nproc; // may occupy parm3 and parm4
       }
     } else {
-      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                      "kmp_sch_static_greedy\n",
                      gtid));
       schedule = kmp_sch_static_greedy;
       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
-      KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
-                     gtid));
+      KD_TRACE(
+          100,
+          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+           gtid));
       pr->u.p.parm1 = tc;
     } // if
   } // case
   break;
   case kmp_sch_guided_analytical_chunked: {
-    T nproc = th->th.th_team_nproc;
-    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
-                   " case\n",
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
+                   "kmp_sch_guided_analytical_chunked case\n",
                    gtid));
+
     if (nproc > 1) {
       if ((2L * chunk + 1) * nproc >= tc) {
         /* chunk size too large, switch to dynamic */
@@ -1061,7 +586,7 @@
 #endif
       } // if
     } else {
-      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                      "kmp_sch_static_greedy\n",
                      gtid));
       schedule = kmp_sch_static_greedy;
@@ -1071,18 +596,18 @@
   } // case
   break;
   case kmp_sch_static_greedy:
-    KD_TRACE(100,
-             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
-    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
-                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
-                        : tc;
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+         gtid));
+    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
     break;
   case kmp_sch_static_chunked:
   case kmp_sch_dynamic_chunked:
     if (pr->u.p.parm1 <= 0) {
       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
     }
-    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                    gtid));
     break;
@@ -1091,12 +616,13 @@
 
     T parm1, parm2, parm3, parm4;
     KD_TRACE(100,
-             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
+             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
+              gtid));
 
     parm1 = chunk;
 
     /* F : size of the first cycle */
-    parm2 = (tc / (2 * th->th.th_team_nproc));
+    parm2 = (tc / (2 * nproc));
 
     if (parm2 < 1) {
       parm2 = 1;
@@ -1142,6 +668,192 @@
   } break;
   } // switch
   pr->schedule = schedule;
+}
+
+#if KMP_USE_HIER_SCHED
+template <typename T>
+inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
+                                             typename traits_t<T>::signed_t st);
+template <>
+inline void
+__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
+                                            kmp_int32 ub, kmp_int32 st) {
+  __kmp_dispatch_init_hierarchy<kmp_int32>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
+}
+template <>
+inline void
+__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
+                                             kmp_uint32 ub, kmp_int32 st) {
+  __kmp_dispatch_init_hierarchy<kmp_uint32>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
+}
+template <>
+inline void
+__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
+                                            kmp_int64 ub, kmp_int64 st) {
+  __kmp_dispatch_init_hierarchy<kmp_int64>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
+}
+template <>
+inline void
+__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
+                                             kmp_uint64 ub, kmp_int64 st) {
+  __kmp_dispatch_init_hierarchy<kmp_uint64>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
+}
+
+// free all the hierarchy scheduling memory associated with the team
+void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
+  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
+  for (int i = 0; i < num_disp_buff; ++i) {
+    // type does not matter here so use kmp_int32
+    auto sh =
+        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
+            &team->t.t_disp_buffer[i]);
+    if (sh->hier) {
+      sh->hier->deallocate();
+      __kmp_free(sh->hier);
+    }
+  }
+}
+#endif
+
+// UT - unsigned flavor of T, ST - signed flavor of T,
+// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
+template <typename T>
+static void
+__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
+                    T ub, typename traits_t<T>::signed_t st,
+                    typename traits_t<T>::signed_t chunk, int push_ws) {
+  typedef typename traits_t<T>::unsigned_t UT;
+
+  int active;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  kmp_uint32 my_buffer_index;
+  dispatch_private_info_template<T> *pr;
+  dispatch_shared_info_template<T> volatile *sh;
+
+  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
+                   sizeof(dispatch_private_info));
+  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
+                   sizeof(dispatch_shared_info));
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_DISPATCH_INIT();
+#endif
+#ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
+                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
+                            traits_t<ST>::spec, traits_t<T>::spec,
+                            traits_t<T>::spec, traits_t<ST>::spec);
+    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
+    __kmp_str_free(&buff);
+  }
+#endif
+  /* setup data */
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  active = !team->t.t_serialized;
+  th->th.th_ident = loc;
+
+  // Any half-decent optimizer will remove this test when the blocks are empty
+  // since the macros expand to nothing
+  // when statistics are disabled.
+  if (schedule == __kmp_static) {
+    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  } else {
+    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
+  }
+
+#if KMP_USE_HIER_SCHED
+  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
+  // Hierarchical scheduling does not work with ordered, so if ordered is
+  // detected, then revert back to threaded scheduling.
+  bool ordered;
+  enum sched_type my_sched = schedule;
+  my_buffer_index = th->th.th_dispatch->th_disp_index;
+  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+      &th->th.th_dispatch
+           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
+  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
+    my_sched =
+        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
+  ordered = (kmp_ord_lower & my_sched);
+  if (pr->flags.use_hier) {
+    if (ordered) {
+      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
+                     "Disabling hierarchical scheduling.\n",
+                     gtid));
+      pr->flags.use_hier = FALSE;
+    }
+  }
+  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
+    // Don't use hierarchical for ordered parallel loops and don't
+    // use the runtime hierarchy if one was specified in the program
+    if (!ordered && !pr->flags.use_hier)
+      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
+  }
+#endif // KMP_USE_HIER_SCHED
+
+#if USE_ITT_BUILD
+  kmp_uint64 cur_chunk = chunk;
+  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
+                                    __kmp_forkjoin_frames_mode == 3 &&
+                                    KMP_MASTER_GTID(gtid) &&
+#if OMP_40_ENABLED
+                                    th->th.th_teams_microtask == NULL &&
+#endif
+                                    team->t.t_active_level == 1;
+#endif
+  if (!active) {
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
+  } else {
+    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+    my_buffer_index = th->th.th_dispatch->th_disp_index++;
+
+    /* What happens when number of threads changes, need to resize buffer? */
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        &th->th.th_dispatch
+             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
+                  my_buffer_index));
+  }
+
+  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
+#if USE_ITT_BUILD
+                                &cur_chunk,
+#endif
+                                chunk, (T)th->th.th_team_nproc,
+                                (T)th->th.th_info.ds.ds_tid);
+  if (active) {
+    if (pr->flags.ordered == 0) {
+      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
+      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
+    } else {
+      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
+      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
+    }
+  }
+
   if (active) {
     /* The name of this buffer should be my_buffer_index when it's free to use
      * it */
@@ -1162,7 +874,7 @@
     th->th.th_dispatch->th_dispatch_sh_current =
         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
 #if USE_ITT_BUILD
-    if (pr->ordered) {
+    if (pr->flags.ordered) {
       __kmp_itt_ordered_init(gtid);
     }
     // Report loop metadata
@@ -1181,7 +893,9 @@
         break;
       case kmp_sch_guided_iterative_chunked:
       case kmp_sch_guided_analytical_chunked:
+#if OMP_45_ENABLED
       case kmp_sch_guided_simd:
+#endif
         schedtype = 2;
         break;
       default:
@@ -1190,8 +904,14 @@
         schedtype = 3;
         break;
       }
-      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
+      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
     }
+#if KMP_USE_HIER_SCHED
+    if (pr->flags.use_hier) {
+      pr->u.p.count = 0;
+      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
+    }
+#endif // KMP_USE_HIER_SCHED
 #endif /* USE_ITT_BUILD */
   }
 
@@ -1208,10 +928,10 @@
         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
-    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
-                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
-                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
-                  pr->u.p.parm3, pr->u.p.parm4));
+    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
+                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
+                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
+                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
     __kmp_str_free(&buff);
   }
 #endif
@@ -1234,12 +954,12 @@
   if (ompt_enabled.ompt_callback_work) {
     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
-    kmp_info_t *thr = __kmp_threads[gtid];
     ompt_callbacks.ompt_callback(ompt_callback_work)(
         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
-        &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
+        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
   }
 #endif
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
 }
 
 /* For ordered loops, either __kmp_dispatch_finish() should be called after
@@ -1389,6 +1109,700 @@
 
 #endif /* KMP_GOMP_COMPAT */
 
+template <typename T>
+int __kmp_dispatch_next_algorithm(int gtid,
+                                  dispatch_private_info_template<T> *pr,
+                                  dispatch_shared_info_template<T> volatile *sh,
+                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
+                                  typename traits_t<T>::signed_t *p_st, T nproc,
+                                  T tid) {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  typedef typename traits_t<T>::floating_t DBL;
+  int status = 0;
+  kmp_int32 last = 0;
+  T start;
+  ST incr;
+  UT limit, trip, init;
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+
+  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+  KMP_DEBUG_ASSERT(pr);
+  KMP_DEBUG_ASSERT(sh);
+  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff =
+        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
+                         "sh:%%p nproc:%%%s tid:%%%s\n",
+                         traits_t<T>::spec, traits_t<T>::spec);
+    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
+    __kmp_str_free(&buff);
+  }
+#endif
+
+  // zero trip count
+  if (pr->u.p.tc == 0) {
+    KD_TRACE(10,
+             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
+              "zero status:%d\n",
+              gtid, status));
+    return 0;
+  }
+
+  switch (pr->schedule) {
+#if (KMP_STATIC_STEAL_ENABLED)
+  case kmp_sch_static_steal: {
+    T chunk = pr->u.p.parm1;
+
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
+              gtid));
+
+    trip = pr->u.p.tc - 1;
+
+    if (traits_t<T>::type_size > 4) {
+      // use lock for 8-byte and CAS for 4-byte induction
+      // variable. TODO (optional): check and use 16-byte CAS
+      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
+      KMP_DEBUG_ASSERT(lck != NULL);
+      if (pr->u.p.count < (UT)pr->u.p.ub) {
+        __kmp_acquire_lock(lck, gtid);
+        // try to get own chunk of iterations
+        init = (pr->u.p.count)++;
+        status = (init < (UT)pr->u.p.ub);
+        __kmp_release_lock(lck, gtid);
+      } else {
+        status = 0; // no own chunks
+      }
+      if (!status) { // try to steal
+        kmp_info_t **other_threads = team->t.t_threads;
+        int while_limit = nproc; // nproc attempts to find a victim
+        int while_index = 0;
+        // TODO: algorithm of searching for a victim
+        // should be cleaned up and measured
+        while ((!status) && (while_limit != ++while_index)) {
+          T remaining;
+          T victimIdx = pr->u.p.parm4;
+          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+          dispatch_private_info_template<T> *victim =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  other_threads[victimIdx]
+                      ->th.th_dispatch->th_dispatch_pr_current);
+          while ((victim == NULL || victim == pr ||
+                  (*(volatile T *)&victim->u.p.static_steal_counter !=
+                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+                 oldVictimIdx != victimIdx) {
+            victimIdx = (victimIdx + 1) % nproc;
+            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+                other_threads[victimIdx]
+                    ->th.th_dispatch->th_dispatch_pr_current);
+          }
+          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+            continue; // try once more (nproc attempts in total)
+            // no victim is ready yet to participate in stealing
+            // because all victims are still in kmp_init_dispatch
+          }
+          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
+            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
+            continue; // not enough chunks to steal, goto next victim
+          }
+
+          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+          KMP_ASSERT(lck != NULL);
+          __kmp_acquire_lock(lck, gtid);
+          limit = victim->u.p.ub; // keep initial ub
+          if (victim->u.p.count >= limit ||
+              (remaining = limit - victim->u.p.count) < 2) {
+            __kmp_release_lock(lck, gtid);
+            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
+            continue; // not enough chunks to steal
+          }
+          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
+          // by 1
+          if (remaining > 3) {
+            // steal 1/4 of remaining
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
+            init = (victim->u.p.ub -= (remaining >> 2));
+          } else {
+            // steal 1 chunk of 2 or 3 remaining
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
+            init = (victim->u.p.ub -= 1);
+          }
+          __kmp_release_lock(lck, gtid);
+
+          KMP_DEBUG_ASSERT(init + 1 <= limit);
+          pr->u.p.parm4 = victimIdx; // remember victim to steal from
+          status = 1;
+          while_index = 0;
+          // now update own count and ub with stolen range but init chunk
+          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          pr->u.p.count = init + 1;
+          pr->u.p.ub = limit;
+          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+        } // while (search for victim)
+      } // if (try to find victim and steal)
+    } else {
+      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
+      typedef union {
+        struct {
+          UT count;
+          T ub;
+        } p;
+        kmp_int64 b;
+      } union_i4;
+      // All operations on 'count' or 'ub' must be combined atomically
+      // together.
+      {
+        union_i4 vold, vnew;
+        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+        vnew = vold;
+        vnew.p.count++;
+        while (!KMP_COMPARE_AND_STORE_ACQ64(
+            (volatile kmp_int64 *)&pr->u.p.count,
+            *VOLATILE_CAST(kmp_int64 *) & vold.b,
+            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+          KMP_CPU_PAUSE();
+          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+          vnew = vold;
+          vnew.p.count++;
+        }
+        vnew = vold;
+        init = vnew.p.count;
+        status = (init < (UT)vnew.p.ub);
+      }
+
+      if (!status) {
+        kmp_info_t **other_threads = team->t.t_threads;
+        int while_limit = nproc; // nproc attempts to find a victim
+        int while_index = 0;
+
+        // TODO: algorithm of searching for a victim
+        // should be cleaned up and measured
+        while ((!status) && (while_limit != ++while_index)) {
+          union_i4 vold, vnew;
+          kmp_int32 remaining;
+          T victimIdx = pr->u.p.parm4;
+          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+          dispatch_private_info_template<T> *victim =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  other_threads[victimIdx]
+                      ->th.th_dispatch->th_dispatch_pr_current);
+          while ((victim == NULL || victim == pr ||
+                  (*(volatile T *)&victim->u.p.static_steal_counter !=
+                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+                 oldVictimIdx != victimIdx) {
+            victimIdx = (victimIdx + 1) % nproc;
+            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+                other_threads[victimIdx]
+                    ->th.th_dispatch->th_dispatch_pr_current);
+          }
+          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+            continue; // try once more (nproc attempts in total)
+            // no victim is ready yet to participate in stealing
+            // because all victims are still in kmp_init_dispatch
+          }
+          pr->u.p.parm4 = victimIdx; // new victim found
+          while (1) { // CAS loop if victim has enough chunks to steal
+            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
+            vnew = vold;
+
+            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+            if (vnew.p.count >= (UT)vnew.p.ub ||
+                (remaining = vnew.p.ub - vnew.p.count) < 2) {
+              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
+              break; // not enough chunks to steal, goto next victim
+            }
+            if (remaining > 3) {
+              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
+            } else {
+              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
+            }
+            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+            // TODO: Should this be acquire or release?
+            if (KMP_COMPARE_AND_STORE_ACQ64(
+                    (volatile kmp_int64 *)&victim->u.p.count,
+                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
+                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+              // stealing succeeded
+              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
+                                        vold.p.ub - vnew.p.ub);
+              status = 1;
+              while_index = 0;
+              // now update own count and ub
+              init = vnew.p.ub;
+              vold.p.count = init + 1;
+#if KMP_ARCH_X86
+              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
+#else
+              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
+#endif
+              break;
+            } // if (check CAS result)
+            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
+          } // while (try to steal from particular victim)
+        } // while (search for victim)
+      } // if (try to find victim and steal)
+    } // if (4-byte induction variable)
+    if (!status) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.parm2;
+      init *= chunk;
+      limit = chunk + init - 1;
+      incr = pr->u.p.st;
+      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
+
+      KMP_DEBUG_ASSERT(init <= trip);
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+    break;
+  } // case
+#endif // ( KMP_STATIC_STEAL_ENABLED )
+  case kmp_sch_static_balanced: {
+    KD_TRACE(
+        10,
+        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
+         gtid));
+    /* check if thread has any iteration to do */
+    if ((status = !pr->u.p.count) != 0) {
+      pr->u.p.count = 1;
+      *p_lb = pr->u.p.lb;
+      *p_ub = pr->u.p.ub;
+      last = pr->u.p.parm1;
+      if (p_st != NULL)
+        *p_st = pr->u.p.st;
+    } else { /* no iterations to do */
+      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
+    }
+  } // case
+  break;
+  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
+                                 merged here */
+  case kmp_sch_static_chunked: {
+    T parm1;
+
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+                   "kmp_sch_static_[affinity|chunked] case\n",
+                   gtid));
+    parm1 = pr->u.p.parm1;
+
+    trip = pr->u.p.tc - 1;
+    init = parm1 * (pr->u.p.count + tid);
+
+    if ((status = (init <= trip)) != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      limit = parm1 + init - 1;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      pr->u.p.count += nproc;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_dynamic_chunked: {
+    T chunk = pr->u.p.parm1;
+
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
+         gtid));
+
+    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+    trip = pr->u.p.tc - 1;
+
+    if ((status = (init <= trip)) == 0) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.lb;
+      limit = chunk + init - 1;
+      incr = pr->u.p.st;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_guided_iterative_chunked: {
+    T chunkspec = pr->u.p.parm1;
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
+                   "iterative case\n",
+                   gtid));
+    trip = pr->u.p.tc;
+    // Start atomic part of calculations
+    while (1) {
+      ST remaining; // signed, because can be < 0
+      init = sh->u.s.iteration; // shared value
+      remaining = trip - init;
+      if (remaining <= 0) { // AC: need to compare with 0 first
+        // nothing to do, don't try atomic op
+        status = 0;
+        break;
+      }
+      if ((T)remaining <
+          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
+        // use dynamic-style schedule
+        // atomically increment iterations, get old value
+        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                                 (ST)chunkspec);
+        remaining = trip - init;
+        if (remaining <= 0) {
+          status = 0; // all iterations got by other threads
+        } else {
+          // got some iterations to work on
+          status = 1;
+          if ((T)remaining > chunkspec) {
+            limit = init + chunkspec - 1;
+          } else {
+            last = 1; // the last chunk
+            limit = init + remaining - 1;
+          } // if
+        } // if
+        break;
+      } // if
+      limit = init +
+              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
+      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                               (ST)init, (ST)limit)) {
+        // CAS was successful, chunk obtained
+        status = 1;
+        --limit;
+        break;
+      } // if
+    } // while
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } // if
+  } // case
+  break;
+
+#if OMP_45_ENABLED
+  case kmp_sch_guided_simd: {
+    // same as iterative but curr-chunk adjusted to be multiple of given
+    // chunk
+    T chunk = pr->u.p.parm1;
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
+              gtid));
+    trip = pr->u.p.tc;
+    // Start atomic part of calculations
+    while (1) {
+      ST remaining; // signed, because can be < 0
+      init = sh->u.s.iteration; // shared value
+      remaining = trip - init;
+      if (remaining <= 0) { // AC: need to compare with 0 first
+        status = 0; // nothing to do, don't try atomic op
+        break;
+      }
+      KMP_DEBUG_ASSERT(init % chunk == 0);
+      // compare with K*nproc*(chunk+1), K=2 by default
+      if ((T)remaining < pr->u.p.parm2) {
+        // use dynamic-style schedule
+        // atomically increment iterations, get old value
+        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                                 (ST)chunk);
+        remaining = trip - init;
+        if (remaining <= 0) {
+          status = 0; // all iterations got by other threads
+        } else {
+          // got some iterations to work on
+          status = 1;
+          if ((T)remaining > chunk) {
+            limit = init + chunk - 1;
+          } else {
+            last = 1; // the last chunk
+            limit = init + remaining - 1;
+          } // if
+        } // if
+        break;
+      } // if
+      // divide by K*nproc
+      UT span = remaining * (*(double *)&pr->u.p.parm3);
+      UT rem = span % chunk;
+      if (rem) // adjust so that span%chunk == 0
+        span += chunk - rem;
+      limit = init + span;
+      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                               (ST)init, (ST)limit)) {
+        // CAS was successful, chunk obtained
+        status = 1;
+        --limit;
+        break;
+      } // if
+    } // while
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } // if
+  } // case
+  break;
+#endif // OMP_45_ENABLED
+
+  case kmp_sch_guided_analytical_chunked: {
+    T chunkspec = pr->u.p.parm1;
+    UT chunkIdx;
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+    /* for storing original FPCW value for Windows* OS on
+       IA-32 architecture 8-byte version */
+    unsigned int oldFpcw;
+    unsigned int fpcwSet = 0;
+#endif
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+                   "kmp_sch_guided_analytical_chunked case\n",
+                   gtid));
+
+    trip = pr->u.p.tc;
+
+    KMP_DEBUG_ASSERT(nproc > 1);
+    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
+
+    while (1) { /* this while loop is a safeguard against unexpected zero
+                   chunk sizes */
+      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+      if (chunkIdx >= (UT)pr->u.p.parm2) {
+        --trip;
+        /* use dynamic-style scheduling */
+        init = chunkIdx * chunkspec + pr->u.p.count;
+        /* need to verify init > 0 in case of overflow in the above
+         * calculation */
+        if ((status = (init > 0 && init <= trip)) != 0) {
+          limit = init + chunkspec - 1;
+
+          if ((last = (limit >= trip)) != 0)
+            limit = trip;
+        }
+        break;
+      } else {
+/* use exponential-style scheduling */
+/* The following check is to workaround the lack of long double precision on
+   Windows* OS.
+   This check works around the possible effect that init != 0 for chunkIdx == 0.
+ */
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+        /* If we haven't already done so, save original
+           FPCW and set precision to 64-bit, as Windows* OS
+           on IA-32 architecture defaults to 53-bit */
+        if (!fpcwSet) {
+          oldFpcw = _control87(0, 0);
+          _control87(_PC_64, _MCW_PC);
+          fpcwSet = 0x30000;
+        }
+#endif
+        if (chunkIdx) {
+          init = __kmp_dispatch_guided_remaining<T>(
+              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
+          KMP_DEBUG_ASSERT(init);
+          init = trip - init;
+        } else
+          init = 0;
+        limit = trip - __kmp_dispatch_guided_remaining<T>(
+                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
+        KMP_ASSERT(init <= limit);
+        if (init < limit) {
+          KMP_DEBUG_ASSERT(limit <= trip);
+          --limit;
+          status = 1;
+          break;
+        } // if
+      } // if
+    } // while (1)
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+    /* restore FPCW if necessary
+       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
+    */
+    if (fpcwSet && (oldFpcw & fpcwSet))
+      _control87(oldFpcw, _MCW_PC);
+#endif
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      }
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    }
+  } // case
+  break;
+
+  case kmp_sch_trapezoidal: {
+    UT index;
+    T parm2 = pr->u.p.parm2;
+    T parm3 = pr->u.p.parm3;
+    T parm4 = pr->u.p.parm4;
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
+              gtid));
+
+    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
+
+    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
+    trip = pr->u.p.tc - 1;
+
+    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.lb;
+      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
+      incr = pr->u.p.st;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+  default: {
+    status = 0; // to avoid complaints on uninitialized variable use
+    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
+                KMP_HNT(GetNewerLibrary), // Hint
+                __kmp_msg_null // Variadic argument list terminator
+                );
+  } break;
+  } // switch
+  if (p_last)
+    *p_last = last;
+#ifdef KMP_DEBUG
+  if (pr->flags.ordered) {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+                            "ordered_lower:%%%s ordered_upper:%%%s\n",
+                            traits_t<UT>::spec, traits_t<UT>::spec);
+    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
+    __kmp_str_free(&buff);
+  }
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
+        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
+    __kmp_str_free(&buff);
+  }
+#endif
+  return status;
+}
+
 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
    is not called. */
@@ -1408,6 +1822,38 @@
 #define OMPT_LOOP_END // no-op
 #endif
 
+#if KMP_STATS_ENABLED
+#define KMP_STATS_LOOP_END                                                     \
+  {                                                                            \
+    kmp_int64 u, l, t, i;                                                      \
+    l = (kmp_int64)(*p_lb);                                                    \
+    u = (kmp_int64)(*p_ub);                                                    \
+    i = (kmp_int64)(pr->u.p.st);                                               \
+    if (status == 0) {                                                         \
+      t = 0;                                                                   \
+      KMP_POP_PARTITIONED_TIMER();                                             \
+    } else if (i == 1) {                                                       \
+      if (u >= l)                                                              \
+        t = u - l + 1;                                                         \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else if (i < 0) {                                                        \
+      if (l >= u)                                                              \
+        t = (l - u) / (-i) + 1;                                                \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else {                                                                   \
+      if (u >= l)                                                              \
+        t = (u - l) / i + 1;                                                   \
+      else                                                                     \
+        t = 0;                                                                 \
+    }                                                                          \
+    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
+  }
+#else
+#define KMP_STATS_LOOP_END /* Nothing */
+#endif
+
 template <typename T>
 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                                T *p_lb, T *p_ub,
@@ -1420,13 +1866,11 @@
 
   typedef typename traits_t<T>::unsigned_t UT;
   typedef typename traits_t<T>::signed_t ST;
-  typedef typename traits_t<T>::floating_t DBL;
-
   // This is potentially slightly misleading, schedule(runtime) will appear here
   // even if the actual runtme schedule is static. (Which points out a
   // disadavantage of schedule(runtime): even when static scheduling is used it
   // costs more than a compile time choice to use static scheduling would.)
-  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
+  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
 
   int status;
   dispatch_private_info_template<T> *pr;
@@ -1434,18 +1878,10 @@
   kmp_team_t *team = th->th.th_team;
 
   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
-#ifdef KMP_DEBUG
-  {
-    char *buff;
-    // create format specifiers before the debug output
-    buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
-                            "p_ub:%%%s p_st:%%%s p_last: %%p\n",
-                            traits_t<T>::spec, traits_t<T>::spec,
-                            traits_t<ST>::spec);
-    KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
-    __kmp_str_free(&buff);
-  }
-#endif
+  KD_TRACE(
+      1000,
+      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
+       gtid, p_lb, p_ub, p_st, p_last));
 
   if (team->t.t_serialized) {
     /* NOTE: serialize this dispatch becase we are not at the active level */
@@ -1465,7 +1901,7 @@
           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
         }
       }
-    } else if (pr->nomerge) {
+    } else if (pr->flags.nomerge) {
       kmp_int32 last;
       T start;
       UT limit, trip, init;
@@ -1513,7 +1949,7 @@
           *p_ub = start + limit * incr;
         }
 
-        if (pr->ordered) {
+        if (pr->flags.ordered) {
           pr->u.p.ordered_lower = init;
           pr->u.p.ordered_upper = limit;
 #ifdef KMP_DEBUG
@@ -1558,13 +1994,11 @@
     SSC_MARK_DISPATCH_NEXT();
 #endif
     OMPT_LOOP_END;
+    KMP_STATS_LOOP_END;
     return status;
   } else {
     kmp_int32 last = 0;
-    dispatch_shared_info_template<UT> *sh;
-    T start;
-    ST incr;
-    UT limit, trip, init;
+    dispatch_shared_info_template<T> volatile *sh;
 
     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
@@ -1572,735 +2006,19 @@
     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
         th->th.th_dispatch->th_dispatch_pr_current);
     KMP_DEBUG_ASSERT(pr);
-    sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
+    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
         th->th.th_dispatch->th_dispatch_sh_current);
     KMP_DEBUG_ASSERT(sh);
 
-    if (pr->u.p.tc == 0) {
-      // zero trip count
-      status = 0;
-    } else {
-      switch (pr->schedule) {
-#if (KMP_STATIC_STEAL_ENABLED)
-      case kmp_sch_static_steal: {
-        T chunk = pr->u.p.parm1;
-        int nproc = th->th.th_team_nproc;
-
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
-                       gtid));
-
-        trip = pr->u.p.tc - 1;
-
-        if (traits_t<T>::type_size > 4) {
-          // use lock for 8-byte and CAS for 4-byte induction
-          // variable. TODO (optional): check and use 16-byte CAS
-          kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
-          KMP_DEBUG_ASSERT(lck != NULL);
-          if (pr->u.p.count < (UT)pr->u.p.ub) {
-            __kmp_acquire_lock(lck, gtid);
-            // try to get own chunk of iterations
-            init = (pr->u.p.count)++;
-            status = (init < (UT)pr->u.p.ub);
-            __kmp_release_lock(lck, gtid);
-          } else {
-            status = 0; // no own chunks
-          }
-          if (!status) { // try to steal
-            kmp_info_t **other_threads = team->t.t_threads;
-            int while_limit = nproc; // nproc attempts to find a victim
-            int while_index = 0;
-            // TODO: algorithm of searching for a victim
-            // should be cleaned up and measured
-            while ((!status) && (while_limit != ++while_index)) {
-              T remaining;
-              T victimIdx = pr->u.p.parm4;
-              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-              dispatch_private_info_template<T> *victim =
-                  reinterpret_cast<dispatch_private_info_template<T> *>(
-                      other_threads[victimIdx]
-                          ->th.th_dispatch->th_dispatch_pr_current);
-              while ((victim == NULL || victim == pr ||
-                      (*(volatile T *)&victim->u.p.static_steal_counter !=
-                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
-                     oldVictimIdx != victimIdx) {
-                victimIdx = (victimIdx + 1) % nproc;
-                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                    other_threads[victimIdx]
-                        ->th.th_dispatch->th_dispatch_pr_current);
-              }
-              if (!victim ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) {
-                continue; // try once more (nproc attempts in total)
-                // no victim is ready yet to participate in stealing
-                // because all victims are still in kmp_init_dispatch
-              }
-              if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
-                pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
-                continue; // not enough chunks to steal, goto next victim
-              }
-
-              lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
-              KMP_ASSERT(lck != NULL);
-              __kmp_acquire_lock(lck, gtid);
-              limit = victim->u.p.ub; // keep initial ub
-              if (victim->u.p.count >= limit ||
-                  (remaining = limit - victim->u.p.count) < 2) {
-                __kmp_release_lock(lck, gtid);
-                pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
-                continue; // not enough chunks to steal
-              }
-              // stealing succeded, reduce victim's ub by 1/4 of undone chunks
-              // or by 1
-              if (remaining > 3) {
-                KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
-                init = (victim->u.p.ub -=
-                        (remaining >> 2)); // steal 1/4 of remaining
-              } else {
-                KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
-                init =
-                    (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
-              }
-              __kmp_release_lock(lck, gtid);
-
-              KMP_DEBUG_ASSERT(init + 1 <= limit);
-              pr->u.p.parm4 = victimIdx; // remember victim to steal from
-              status = 1;
-              while_index = 0;
-              // now update own count and ub with stolen range but init chunk
-              __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
-              pr->u.p.count = init + 1;
-              pr->u.p.ub = limit;
-              __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
-            } // while (search for victim)
-          } // if (try to find victim and steal)
-        } else {
-          // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
-          typedef union {
-            struct {
-              UT count;
-              T ub;
-            } p;
-            kmp_int64 b;
-          } union_i4;
-          // All operations on 'count' or 'ub' must be combined atomically
-          // together.
-          {
-            union_i4 vold, vnew;
-            vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
-            vnew = vold;
-            vnew.p.count++;
-            while (!KMP_COMPARE_AND_STORE_ACQ64(
-                (volatile kmp_int64 *)&pr->u.p.count,
-                *VOLATILE_CAST(kmp_int64 *) & vold.b,
-                *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
-              KMP_CPU_PAUSE();
-              vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
-              vnew = vold;
-              vnew.p.count++;
-            }
-            vnew = vold;
-            init = vnew.p.count;
-            status = (init < (UT)vnew.p.ub);
-          }
-
-          if (!status) {
-            kmp_info_t **other_threads = team->t.t_threads;
-            int while_limit = nproc; // nproc attempts to find a victim
-            int while_index = 0;
-
-            // TODO: algorithm of searching for a victim
-            // should be cleaned up and measured
-            while ((!status) && (while_limit != ++while_index)) {
-              union_i4 vold, vnew;
-              kmp_int32 remaining;
-              T victimIdx = pr->u.p.parm4;
-              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-              dispatch_private_info_template<T> *victim =
-                  reinterpret_cast<dispatch_private_info_template<T> *>(
-                      other_threads[victimIdx]
-                          ->th.th_dispatch->th_dispatch_pr_current);
-              while ((victim == NULL || victim == pr ||
-                      (*(volatile T *)&victim->u.p.static_steal_counter !=
-                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
-                     oldVictimIdx != victimIdx) {
-                victimIdx = (victimIdx + 1) % nproc;
-                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                    other_threads[victimIdx]
-                        ->th.th_dispatch->th_dispatch_pr_current);
-              }
-              if (!victim ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) {
-                continue; // try once more (nproc attempts in total)
-                // no victim is ready yet to participate in stealing
-                // because all victims are still in kmp_init_dispatch
-              }
-              pr->u.p.parm4 = victimIdx; // new victim found
-              while (1) { // CAS loop if victim has enough chunks to steal
-                vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
-                vnew = vold;
-
-                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
-                if (vnew.p.count >= (UT)vnew.p.ub ||
-                    (remaining = vnew.p.ub - vnew.p.count) < 2) {
-                  pr->u.p.parm4 =
-                      (victimIdx + 1) % nproc; // shift start victim id
-                  break; // not enough chunks to steal, goto next victim
-                }
-                if (remaining > 3) {
-                  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
-                } else {
-                  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
-                }
-                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
-                // TODO: Should this be acquire or release?
-                if (KMP_COMPARE_AND_STORE_ACQ64(
-                        (volatile kmp_int64 *)&victim->u.p.count,
-                        *VOLATILE_CAST(kmp_int64 *) & vold.b,
-                        *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
-                  // stealing succeeded
-                  KMP_COUNT_VALUE(FOR_static_steal_stolen,
-                                  vold.p.ub - vnew.p.ub);
-                  status = 1;
-                  while_index = 0;
-                  // now update own count and ub
-                  init = vnew.p.ub;
-                  vold.p.count = init + 1;
-#if KMP_ARCH_X86
-                  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
-                                   vold.b);
-#else
-                  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
-#endif
-                  break;
-                } // if (check CAS result)
-                KMP_CPU_PAUSE(); // CAS failed, repeat attempt
-              } // while (try to steal from particular victim)
-            } // while (search for victim)
-          } // if (try to find victim and steal)
-        } // if (4-byte induction variable)
-        if (!status) {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        } else {
-          start = pr->u.p.parm2;
-          init *= chunk;
-          limit = chunk + init - 1;
-          incr = pr->u.p.st;
-          KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
-
-          KMP_DEBUG_ASSERT(init <= trip);
-          if ((last = (limit >= trip)) != 0)
-            limit = trip;
-          if (p_st != NULL)
-            *p_st = incr;
-
-          if (incr == 1) {
-            *p_lb = start + init;
-            *p_ub = start + limit;
-          } else {
-            *p_lb = start + init * incr;
-            *p_ub = start + limit * incr;
-          }
-
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } // if
-        break;
-      } // case
-#endif // ( KMP_STATIC_STEAL_ENABLED )
-      case kmp_sch_static_balanced: {
-        KD_TRACE(
-            100,
-            ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
-        if ((status = !pr->u.p.count) !=
-            0) { /* check if thread has any iteration to do */
-          pr->u.p.count = 1;
-          *p_lb = pr->u.p.lb;
-          *p_ub = pr->u.p.ub;
-          last = pr->u.p.parm1;
-          if (p_st != NULL)
-            *p_st = pr->u.p.st;
-        } else { /* no iterations to do */
-          pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
-        }
-        if (pr->ordered) {
-#ifdef KMP_DEBUG
-          {
-            char *buff;
-            // create format specifiers before the debug output
-            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                    traits_t<UT>::spec, traits_t<UT>::spec);
-            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                            pr->u.p.ordered_upper));
-            __kmp_str_free(&buff);
-          }
-#endif
-        } // if
-      } // case
-      break;
-      case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
-                                     merged here */
-      case kmp_sch_static_chunked: {
-        T parm1;
-
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
-                       "kmp_sch_static_[affinity|chunked] case\n",
-                       gtid));
-        parm1 = pr->u.p.parm1;
-
-        trip = pr->u.p.tc - 1;
-        init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
-
-        if ((status = (init <= trip)) != 0) {
-          start = pr->u.p.lb;
-          incr = pr->u.p.st;
-          limit = parm1 + init - 1;
-
-          if ((last = (limit >= trip)) != 0)
-            limit = trip;
-
-          if (p_st != NULL)
-            *p_st = incr;
-
-          pr->u.p.count += th->th.th_team_nproc;
-
-          if (incr == 1) {
-            *p_lb = start + init;
-            *p_ub = start + limit;
-          } else {
-            *p_lb = start + init * incr;
-            *p_ub = start + limit * incr;
-          }
-
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } // if
-      } // case
-      break;
-
-      case kmp_sch_dynamic_chunked: {
-        T chunk = pr->u.p.parm1;
-
-        KD_TRACE(
-            100,
-            ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
-
-        init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
-        trip = pr->u.p.tc - 1;
-
-        if ((status = (init <= trip)) == 0) {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        } else {
-          start = pr->u.p.lb;
-          limit = chunk + init - 1;
-          incr = pr->u.p.st;
-
-          if ((last = (limit >= trip)) != 0)
-            limit = trip;
-
-          if (p_st != NULL)
-            *p_st = incr;
-
-          if (incr == 1) {
-            *p_lb = start + init;
-            *p_ub = start + limit;
-          } else {
-            *p_lb = start + init * incr;
-            *p_ub = start + limit * incr;
-          }
-
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } // if
-      } // case
-      break;
-
-      case kmp_sch_guided_iterative_chunked: {
-        T chunkspec = pr->u.p.parm1;
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
-                       "iterative case\n",
-                       gtid));
-        trip = pr->u.p.tc;
-        // Start atomic part of calculations
-        while (1) {
-          ST remaining; // signed, because can be < 0
-          init = sh->u.s.iteration; // shared value
-          remaining = trip - init;
-          if (remaining <= 0) { // AC: need to compare with 0 first
-            // nothing to do, don't try atomic op
-            status = 0;
-            break;
-          }
-          if ((T)remaining <
-              pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
-            // use dynamic-style shcedule
-            // atomically inrement iterations, get old value
-            init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
-                                     (ST)chunkspec);
-            remaining = trip - init;
-            if (remaining <= 0) {
-              status = 0; // all iterations got by other threads
-            } else { // got some iterations to work on
-              status = 1;
-              if ((T)remaining > chunkspec) {
-                limit = init + chunkspec - 1;
-              } else {
-                last = 1; // the last chunk
-                limit = init + remaining - 1;
-              } // if
-            } // if
-            break;
-          } // if
-          limit = init + (UT)(remaining *
-                              *(double *)&pr->u.p.parm3); // divide by K*nproc
-          if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
-                                   (ST)init, (ST)limit)) {
-            // CAS was successful, chunk obtained
-            status = 1;
-            --limit;
-            break;
-          } // if
-        } // while
-        if (status != 0) {
-          start = pr->u.p.lb;
-          incr = pr->u.p.st;
-          if (p_st != NULL)
-            *p_st = incr;
-          *p_lb = start + init * incr;
-          *p_ub = start + limit * incr;
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } else {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        } // if
-      } // case
-      break;
-
-      case kmp_sch_guided_simd: {
-        // same as iterative but curr-chunk adjusted to be multiple of given
-        // chunk
-        T chunk = pr->u.p.parm1;
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
-                       gtid));
-        trip = pr->u.p.tc;
-        // Start atomic part of calculations
-        while (1) {
-          ST remaining; // signed, because can be < 0
-          init = sh->u.s.iteration; // shared value
-          remaining = trip - init;
-          if (remaining <= 0) { // AC: need to compare with 0 first
-            status = 0; // nothing to do, don't try atomic op
-            break;
-          }
-          KMP_DEBUG_ASSERT(init % chunk == 0);
-          // compare with K*nproc*(chunk+1), K=2 by default
-          if ((T)remaining < pr->u.p.parm2) {
-            // use dynamic-style shcedule
-            // atomically inrement iterations, get old value
-            init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
-                                     (ST)chunk);
-            remaining = trip - init;
-            if (remaining <= 0) {
-              status = 0; // all iterations got by other threads
-            } else {
-              // got some iterations to work on
-              status = 1;
-              if ((T)remaining > chunk) {
-                limit = init + chunk - 1;
-              } else {
-                last = 1; // the last chunk
-                limit = init + remaining - 1;
-              } // if
-            } // if
-            break;
-          } // if
-          // divide by K*nproc
-          UT span = remaining * (*(double *)&pr->u.p.parm3);
-          UT rem = span % chunk;
-          if (rem) // adjust so that span%chunk == 0
-            span += chunk - rem;
-          limit = init + span;
-          if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
-                                   (ST)init, (ST)limit)) {
-            // CAS was successful, chunk obtained
-            status = 1;
-            --limit;
-            break;
-          } // if
-        } // while
-        if (status != 0) {
-          start = pr->u.p.lb;
-          incr = pr->u.p.st;
-          if (p_st != NULL)
-            *p_st = incr;
-          *p_lb = start + init * incr;
-          *p_ub = start + limit * incr;
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } else {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        } // if
-      } // case
-      break;
-
-      case kmp_sch_guided_analytical_chunked: {
-        T chunkspec = pr->u.p.parm1;
-        UT chunkIdx;
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
-        /* for storing original FPCW value for Windows* OS on
-           IA-32 architecture 8-byte version */
-        unsigned int oldFpcw;
-        unsigned int fpcwSet = 0;
-#endif
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
-                       "analytical case\n",
-                       gtid));
-
-        trip = pr->u.p.tc;
-
-        KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
-        KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
-                         trip);
-
-        while (1) { /* this while loop is a safeguard against unexpected zero
-                       chunk sizes */
-          chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
-          if (chunkIdx >= (UT)pr->u.p.parm2) {
-            --trip;
-            /* use dynamic-style scheduling */
-            init = chunkIdx * chunkspec + pr->u.p.count;
-            /* need to verify init > 0 in case of overflow in the above
-             * calculation */
-            if ((status = (init > 0 && init <= trip)) != 0) {
-              limit = init + chunkspec - 1;
-
-              if ((last = (limit >= trip)) != 0)
-                limit = trip;
-            }
-            break;
-          } else {
-/* use exponential-style scheduling */
-/* The following check is to workaround the lack of long double precision on
-   Windows* OS.
-   This check works around the possible effect that init != 0 for chunkIdx == 0.
- */
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
-            /* If we haven't already done so, save original FPCW and set
-               precision to 64-bit, as Windows* OS on IA-32 architecture
-               defaults to 53-bit */
-            if (!fpcwSet) {
-              oldFpcw = _control87(0, 0);
-              _control87(_PC_64, _MCW_PC);
-              fpcwSet = 0x30000;
-            }
-#endif
-            if (chunkIdx) {
-              init = __kmp_dispatch_guided_remaining<T>(
-                  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
-              KMP_DEBUG_ASSERT(init);
-              init = trip - init;
-            } else
-              init = 0;
-            limit = trip - __kmp_dispatch_guided_remaining<T>(
-                               trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
-            KMP_ASSERT(init <= limit);
-            if (init < limit) {
-              KMP_DEBUG_ASSERT(limit <= trip);
-              --limit;
-              status = 1;
-              break;
-            } // if
-          } // if
-        } // while (1)
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
-        /* restore FPCW if necessary
-           AC: check fpcwSet flag first because oldFpcw can be uninitialized
-           here */
-        if (fpcwSet && (oldFpcw & fpcwSet))
-          _control87(oldFpcw, _MCW_PC);
-#endif
-        if (status != 0) {
-          start = pr->u.p.lb;
-          incr = pr->u.p.st;
-          if (p_st != NULL)
-            *p_st = incr;
-          *p_lb = start + init * incr;
-          *p_ub = start + limit * incr;
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          }
-        } else {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        }
-      } // case
-      break;
-
-      case kmp_sch_trapezoidal: {
-        UT index;
-        T parm2 = pr->u.p.parm2;
-        T parm3 = pr->u.p.parm3;
-        T parm4 = pr->u.p.parm4;
-        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
-                       gtid));
-
-        index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
-
-        init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
-        trip = pr->u.p.tc - 1;
-
-        if ((status = ((T)index < parm3 && init <= trip)) == 0) {
-          *p_lb = 0;
-          *p_ub = 0;
-          if (p_st != NULL)
-            *p_st = 0;
-        } else {
-          start = pr->u.p.lb;
-          limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
-          incr = pr->u.p.st;
-
-          if ((last = (limit >= trip)) != 0)
-            limit = trip;
-
-          if (p_st != NULL)
-            *p_st = incr;
-
-          if (incr == 1) {
-            *p_lb = start + init;
-            *p_ub = start + limit;
-          } else {
-            *p_lb = start + init * incr;
-            *p_ub = start + limit * incr;
-          }
-
-          if (pr->ordered) {
-            pr->u.p.ordered_lower = init;
-            pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
-            {
-              char *buff;
-              // create format specifiers before the debug output
-              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
-                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
-                                      traits_t<UT>::spec, traits_t<UT>::spec);
-              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
-                              pr->u.p.ordered_upper));
-              __kmp_str_free(&buff);
-            }
-#endif
-          } // if
-        } // if
-      } // case
-      break;
-      default: {
-        status = 0; // to avoid complaints on uninitialized variable use
-        __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
-                    KMP_HNT(GetNewerLibrary), // Hint
-                    __kmp_msg_null // Variadic argument list terminator
-                    );
-      } break;
-      } // switch
-    } // if tc == 0;
-
+#if KMP_USE_HIER_SCHED
+    if (pr->flags.use_hier)
+      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
+    else
+#endif // KMP_USE_HIER_SCHED
+      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
+                                                p_st, th->th.th_team_nproc,
+                                                th->th.th_info.ds.ds_tid);
+    // status == 0: no more iterations to execute
     if (status == 0) {
       UT num_done;
 
@@ -2312,11 +2030,14 @@
         buff = __kmp_str_format(
             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
             traits_t<UT>::spec);
-        KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
+        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
         __kmp_str_free(&buff);
       }
 #endif
 
+#if KMP_USE_HIER_SCHED
+      pr->flags.use_hier = FALSE;
+#endif
       if ((ST)num_done == th->th.th_team_nproc - 1) {
 #if (KMP_STATIC_STEAL_ENABLED)
         if (pr->schedule == kmp_sch_static_steal &&
@@ -2341,7 +2062,7 @@
         sh->u.s.iteration = 0;
 
         /* TODO replace with general release procedure? */
-        if (pr->ordered) {
+        if (pr->flags.ordered) {
           sh->u.s.ordered_iteration = 0;
         }
 
@@ -2380,9 +2101,10 @@
     // create format specifiers before the debug output
     buff = __kmp_str_format(
         "__kmp_dispatch_next: T#%%d normal case: "
-        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
+        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
-    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
+    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
+                  (p_last ? *p_last : 0), status));
     __kmp_str_free(&buff);
   }
 #endif
@@ -2390,6 +2112,7 @@
   SSC_MARK_DISPATCH_NEXT();
 #endif
   OMPT_LOOP_END;
+  KMP_STATS_LOOP_END;
   return status;
 }
 
@@ -2398,7 +2121,6 @@
                                   kmp_int32 *plastiter, T *plower, T *pupper,
                                   typename traits_t<T>::signed_t incr) {
   typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
   kmp_uint32 team_id;
   kmp_uint32 nteams;
   UT trip_count;
@@ -2408,6 +2130,7 @@
   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
 #ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
   {
     char *buff;
     // create format specifiers before the debug output
@@ -2445,7 +2168,7 @@
   nteams = th->th.th_teams_size.nteams;
 #endif
   team_id = team->t.t_master_tid;
-  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
 
   // compute global trip count
   if (incr == 1) {
diff --git a/runtime/src/kmp_dispatch.h b/runtime/src/kmp_dispatch.h
new file mode 100644
index 0000000..ff30d67
--- /dev/null
+++ b/runtime/src/kmp_dispatch.h
@@ -0,0 +1,521 @@
+/*
+ * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DISPATCH_H
+#define KMP_DISPATCH_H
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+// Need to raise Win version from XP to Vista here for support of
+// InterlockedExchange64
+#if defined(_WIN32_WINNT) && defined(_M_IX86)
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0502
+#endif
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+#include <float.h>
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+#if KMP_USE_HIER_SCHED
+// Forward declarations of some hierarchical scheduling data structures
+template <typename T> struct kmp_hier_t;
+template <typename T> struct kmp_hier_top_unit_t;
+#endif // KMP_USE_HIER_SCHED
+
+template <typename T> struct dispatch_shared_info_template;
+template <typename T> struct dispatch_private_info_template;
+
+template <typename T>
+extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+                                          dispatch_private_info_template<T> *pr,
+                                          enum sched_type schedule, T lb, T ub,
+                                          typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+                                          kmp_uint64 *cur_chunk,
+#endif
+                                          typename traits_t<T>::signed_t chunk,
+                                          T nproc, T unit_id);
+template <typename T>
+extern int __kmp_dispatch_next_algorithm(
+    int gtid, dispatch_private_info_template<T> *pr,
+    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
+    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
+
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+
+#if KMP_STATIC_STEAL_ENABLED
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  UT count; // unsigned
+  T ub;
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  T lb;
+  ST st; // signed
+  UT tc; // unsigned
+  T static_steal_counter; // for static_steal only; maybe better to put after ub
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because of parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
+    T parm1;
+    T parm2;
+    T parm3;
+    T parm4;
+  };
+
+  UT ordered_lower; // unsigned
+  UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+  T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+
+#else /* KMP_STATIC_STEAL_ENABLED */
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  T lb;
+  T ub;
+  ST st; // signed
+  UT tc; // unsigned
+
+  T parm1;
+  T parm2;
+  T parm3;
+  T parm4;
+
+  UT count; // unsigned
+
+  UT ordered_lower; // unsigned
+  UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+  T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
+  // duplicate alignment here, otherwise size of structure is not correct in our
+  // compiler
+  union KMP_ALIGN_CACHE private_info_tmpl {
+    dispatch_private_infoXX_template<T> p;
+    dispatch_private_info64_t p64;
+  } u;
+  enum sched_type schedule; /* scheduling algorithm */
+  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+  kmp_uint32 ordered_bumped;
+  // to retain the structure size after making order
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
+  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
+  kmp_uint32 type_size;
+#if KMP_USE_HIER_SCHED
+  kmp_int32 hier_id;
+  kmp_hier_top_unit_t<T> *hier_parent;
+  // member functions
+  kmp_int32 get_hier_id() const { return hier_id; }
+  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
+#endif
+  enum cons_type pushed_ws;
+};
+
+// replaces dispatch_shared_info{32,64} structures and
+// dispatch_shared_info{32,64}_t types
+template <typename T> struct dispatch_shared_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile UT iteration;
+  volatile UT num_done;
+  volatile UT ordered_iteration;
+  // to retain the structure size making ordered_iteration scalar
+  UT ordered_dummy[KMP_MAX_ORDERED - 3];
+};
+
+// replaces dispatch_shared_info structure and dispatch_shared_info_t type
+template <typename T> struct dispatch_shared_info_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  // we need union here to keep the structure size
+  union shared_info_tmpl {
+    dispatch_shared_infoXX_template<UT> s;
+    dispatch_shared_info64_t s64;
+  } u;
+  volatile kmp_uint32 buffer_index;
+#if OMP_45_ENABLED
+  volatile kmp_int32 doacross_buf_idx; // teamwise index
+  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
+  kmp_int32 doacross_num_done; // count finished threads
+#endif
+#if KMP_USE_HIER_SCHED
+  kmp_hier_t<T> *hier;
+#endif
+#if KMP_USE_HWLOC
+  // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
+  // machines (> 48 cores). Performance analysis showed that a cache thrash
+  // was occurring and this padding helps alleviate the problem.
+  char padding[64];
+#endif
+};
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#undef USE_TEST_LOCKS
+
+// test_then_add template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
+
+template <>
+__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
+                                                 kmp_int32 d) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_ADD32(p, d);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
+                                                 kmp_int64 d) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_ADD64(p, d);
+  return r;
+}
+
+// test_then_inc_acq template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_INC_ACQ32(p);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_INC_ACQ64(p);
+  return r;
+}
+
+// test_then_inc template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_INC32(p);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_INC64(p);
+  return r;
+}
+
+// compare_and_swap template (general template should NOT be used)
+template <typename T>
+static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
+                                                    kmp_int32 c, kmp_int32 s) {
+  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
+}
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
+                                                    kmp_int64 c, kmp_int64 s) {
+  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
+}
+
+template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
+  return value >= checker;
+}
+template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
+  return value == checker;
+}
+
+/*
+    Spin wait loop that first does pause, then yield.
+    Waits until function returns non-zero when called with *spinner and check.
+    Does NOT put threads to sleep.
+    Arguments:
+        UT is unsigned 4- or 8-byte type
+        spinner - memory location to check value
+        checker - value which spinner is >, <, ==, etc.
+        pred - predicate function to perform binary comparison of some sort
+#if USE_ITT_BUILD
+        obj -- is higher-level synchronization object to report to ittnotify. It
+        is used to report locks consistently. For example, if lock is acquired
+        immediately, its address is reported to ittnotify via
+        KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
+        and lock routine calls to KMP_WAIT_YIELD(), the later should report the
+        same address, not an address of low-level spinner.
+#endif // USE_ITT_BUILD
+    TODO: make inline function (move to header file for icl)
+*/
+template <typename UT>
+static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
+                           kmp_uint32 (*pred)(UT, UT)
+                               USE_ITT_BUILD_ARG(void *obj)) {
+  // note: we may not belong to a team at this point
+  volatile UT *spin = spinner;
+  UT check = checker;
+  kmp_uint32 spins;
+  kmp_uint32 (*f)(UT, UT) = pred;
+  UT r;
+
+  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
+  KMP_INIT_YIELD(spins);
+  // main wait spin loop
+  while (!f(r = *spin, check)) {
+    KMP_FSYNC_SPIN_PREPARE(obj);
+    /* GEH - remove this since it was accidentally introduced when kmp_wait was
+       split.
+       It causes problems with infinite recursion because of exit lock */
+    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+        __kmp_abort_thread(); */
+
+    // if we are oversubscribed,
+    // or have waited a bit (and KMP_LIBRARY=throughput, then yield
+    // pause is in the following code
+    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
+    KMP_YIELD_SPIN(spins);
+  }
+  KMP_FSYNC_SPIN_ACQUIRED(obj);
+  return r;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+template <typename UT>
+void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  dispatch_private_info_template<UT> *pr;
+
+  int gtid = *gtid_ref;
+  //    int  cid = *cid_ref;
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
+  if (__kmp_env_consistency_check) {
+    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+        th->th.th_dispatch->th_dispatch_pr_current);
+    if (pr->pushed_ws != ct_none) {
+#if KMP_USE_DYNAMIC_LOCK
+      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
+#else
+      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
+#endif
+    }
+  }
+
+  if (!th->th.th_team->t.t_serialized) {
+    dispatch_shared_info_template<UT> *sh =
+        reinterpret_cast<dispatch_shared_info_template<UT> *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+    UT lower;
+
+    if (!__kmp_env_consistency_check) {
+      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+          th->th.th_dispatch->th_dispatch_pr_current);
+    }
+    lower = pr->u.p.ordered_lower;
+
+#if !defined(KMP_GOMP_COMPAT)
+    if (__kmp_env_consistency_check) {
+      if (pr->ordered_bumped) {
+        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+                               ct_ordered_in_pdo, loc_ref,
+                               &p->stack_data[p->w_top]);
+      }
+    }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+    KMP_MB();
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
+                              "ordered_iter:%%%s lower:%%%s\n",
+                              traits_t<UT>::spec, traits_t<UT>::spec);
+      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+      __kmp_str_free(&buff);
+    }
+#endif
+    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
+                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+    KMP_MB(); /* is this necessary? */
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
+                              "ordered_iter:%%%s lower:%%%s\n",
+                              traits_t<UT>::spec, traits_t<UT>::spec);
+      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+      __kmp_str_free(&buff);
+    }
+#endif
+  }
+  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
+}
+
+template <typename UT>
+void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  typedef typename traits_t<UT>::signed_t ST;
+  dispatch_private_info_template<UT> *pr;
+
+  int gtid = *gtid_ref;
+  //    int  cid = *cid_ref;
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
+  if (__kmp_env_consistency_check) {
+    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+        th->th.th_dispatch->th_dispatch_pr_current);
+    if (pr->pushed_ws != ct_none) {
+      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
+    }
+  }
+
+  if (!th->th.th_team->t.t_serialized) {
+    dispatch_shared_info_template<UT> *sh =
+        reinterpret_cast<dispatch_shared_info_template<UT> *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+
+    if (!__kmp_env_consistency_check) {
+      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+          th->th.th_dispatch->th_dispatch_pr_current);
+    }
+
+    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
+#if !defined(KMP_GOMP_COMPAT)
+    if (__kmp_env_consistency_check) {
+      if (pr->ordered_bumped != 0) {
+        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+        /* How to test it? - OM */
+        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+                               ct_ordered_in_pdo, loc_ref,
+                               &p->stack_data[p->w_top]);
+      }
+    }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    pr->ordered_bumped += 1;
+
+    KD_TRACE(1000,
+             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
+              gtid, pr->ordered_bumped));
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    /* TODO use general release procedure? */
+    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
+}
+
+/* Computes and returns x to the power of y, where y must a non-negative integer
+ */
+template <typename UT>
+static __forceinline long double __kmp_pow(long double x, UT y) {
+  long double s = 1.0L;
+
+  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
+  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
+  while (y) {
+    if (y & 1)
+      s *= x;
+    x *= x;
+    y >>= 1;
+  }
+  return s;
+}
+
+/* Computes and returns the number of unassigned iterations after idx chunks
+   have been assigned
+   (the total number of unassigned iterations in chunks with index greater than
+   or equal to idx).
+   __forceinline seems to be broken so that if we __forceinline this function,
+   the behavior is wrong
+   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
+*/
+template <typename T>
+static __inline typename traits_t<T>::unsigned_t
+__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
+                                typename traits_t<T>::unsigned_t idx) {
+  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
+     least for ICL 8.1, long double arithmetic may not really have
+     long double precision, even with /Qlong_double.  Currently, we
+     workaround that in the caller code, by manipulating the FPCW for
+     Windows* OS on IA-32 architecture.  The lack of precision is not
+     expected to be a correctness issue, though.
+  */
+  typedef typename traits_t<T>::unsigned_t UT;
+
+  long double x = tc * __kmp_pow<UT>(base, idx);
+  UT r = (UT)x;
+  if (x == r)
+    return r;
+  return r + 1;
+}
+
+// Parameters of the guided-iterative algorithm:
+//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
+//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
+// by default n = 2. For example with n = 3 the chunks distribution will be more
+// flat.
+// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
+static const int guided_int_param = 2;
+static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
+#endif // KMP_DISPATCH_H
diff --git a/runtime/src/kmp_dispatch_hier.h b/runtime/src/kmp_dispatch_hier.h
new file mode 100644
index 0000000..8277eaa
--- /dev/null
+++ b/runtime/src/kmp_dispatch_hier.h
@@ -0,0 +1,1090 @@
+#ifndef KMP_DISPATCH_HIER_H
+#define KMP_DISPATCH_HIER_H
+#include "kmp.h"
+#include "kmp_dispatch.h"
+
+// Layer type for scheduling hierarchy
+enum kmp_hier_layer_e {
+  LAYER_THREAD = -1, // per-thread level, below any topology unit
+  LAYER_L1, // L1 cache level
+  LAYER_L2, // L2 cache level
+  LAYER_L3, // L3 cache level
+  LAYER_NUMA, // NUMA-domain level
+  LAYER_LOOP, // whole-loop level (top of the hierarchy)
+  LAYER_LAST // count of real layers; also used as array-size sentinel
+};
+
+// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
+static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
+  switch (type) { // exhaustive over kmp_hier_layer_e; no default on purpose
+  case kmp_hier_layer_e::LAYER_THREAD:
+    return "THREAD";
+  case kmp_hier_layer_e::LAYER_L1:
+    return "L1";
+  case kmp_hier_layer_e::LAYER_L2:
+    return "L2";
+  case kmp_hier_layer_e::LAYER_L3:
+    return "L3";
+  case kmp_hier_layer_e::LAYER_NUMA:
+    return "NUMA";
+  case kmp_hier_layer_e::LAYER_LOOP:
+    return "WHOLE_LOOP";
+  case kmp_hier_layer_e::LAYER_LAST:
+    return "LAST";
+  }
+  KMP_ASSERT(0); // unreachable for valid enum values
+  // Appease compilers, should never get here
+  return "ERROR";
+}
+
+// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
+typedef struct kmp_hier_sched_env_t {
+  int size; // number of levels appended so far
+  int capacity; // allocated entries per array (0 until first append())
+  enum sched_type *scheds; // per-level schedule kind
+  kmp_int32 *small_chunks; // per-level chunk size (32-bit)
+  kmp_int64 *large_chunks; // per-level chunk size (64-bit copy of same value)
+  kmp_hier_layer_e *layers; // per-level topology layer
+  // Append a level of the hierarchy
+  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
+    if (capacity == 0) { // lazily allocate fixed-size (LAYER_LAST) arrays
+      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
+                                                  kmp_hier_layer_e::LAYER_LAST);
+      capacity = kmp_hier_layer_e::LAYER_LAST;
+    }
+    int current_size = size;
+    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST); // no growth beyond LAYER_LAST
+    scheds[current_size] = sched;
+    layers[current_size] = layer;
+    small_chunks[current_size] = chunk;
+    large_chunks[current_size] = (kmp_int64)chunk; // both widths kept in sync
+    size++;
+  }
+  // Sort the hierarchy using selection sort, size will always be small
+  // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
+  void sort() {
+    if (size <= 1)
+      return;
+    for (int i = 0; i < size; ++i) {
+      int switch_index = i; // index of smallest remaining layer value
+      for (int j = i + 1; j < size; ++j) {
+        if (layers[j] < layers[switch_index])
+          switch_index = j;
+      }
+      if (switch_index != i) { // swap all four parallel arrays together
+        kmp_hier_layer_e temp1 = layers[i];
+        enum sched_type temp2 = scheds[i];
+        kmp_int32 temp3 = small_chunks[i];
+        kmp_int64 temp4 = large_chunks[i];
+        layers[i] = layers[switch_index];
+        scheds[i] = scheds[switch_index];
+        small_chunks[i] = small_chunks[switch_index];
+        large_chunks[i] = large_chunks[switch_index];
+        layers[switch_index] = temp1;
+        scheds[switch_index] = temp2;
+        small_chunks[switch_index] = temp3;
+        large_chunks[switch_index] = temp4;
+      }
+    }
+  }
+  // Free all memory
+  void deallocate() {
+    if (capacity > 0) { // arrays exist only after first append()
+      __kmp_free(scheds);
+      __kmp_free(layers);
+      __kmp_free(small_chunks);
+      __kmp_free(large_chunks);
+      scheds = NULL;
+      layers = NULL;
+      small_chunks = NULL;
+      large_chunks = NULL;
+    }
+    size = 0;
+    capacity = 0;
+  }
+} kmp_hier_sched_env_t;
+
+extern int __kmp_dispatch_hand_threading; // treated as boolean flag in this header
+extern kmp_hier_sched_env_t __kmp_hier_scheds; // parsed OMP_SCHEDULE hierarchy
+
+// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
+extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
+extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
+
+extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type); // defined elsewhere in the runtime
+extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
+extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
+                                        kmp_hier_layer_e t2);
+extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
+
+template <typename T> struct kmp_hier_shared_bdata_t { // double-buffered shared barrier data; [index]=current, [1-index]=next
+  typedef typename traits_t<T>::signed_t ST;
+  volatile kmp_uint64 val[2]; // barrier flag/counter words, one per buffer
+  kmp_int32 status[2]; // dispatch status per buffer
+  T lb[2]; // lower bound per buffer
+  T ub[2]; // upper bound per buffer
+  ST st[2]; // stride per buffer
+  dispatch_shared_info_template<T> sh[2]; // shared dispatch info per buffer
+  void zero() { // reset both buffers
+    val[0] = val[1] = 0;
+    status[0] = status[1] = 0;
+    lb[0] = lb[1] = 0;
+    ub[0] = ub[1] = 0;
+    st[0] = st[1] = 0;
+    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
+  }
+  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
+                            kmp_uint64 index) { // like set_next() but leaves sh untouched
+    lb[1 - index] = nlb;
+    ub[1 - index] = nub;
+    st[1 - index] = nst;
+    status[1 - index] = nstatus;
+  }
+  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
+    lb[1 - index] = nlb;
+    ub[1 - index] = nub;
+    st[1 - index] = nst;
+    status[1 - index] = nstatus;
+    sh[1 - index].u.s.iteration = 0; // also reset next buffer's iteration count
+  }
+
+  kmp_int32 get_next_status(kmp_uint64 index) const {
+    return status[1 - index];
+  }
+  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
+  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
+  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
+  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
+    return &(sh[1 - index]);
+  }
+
+  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
+  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
+  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
+  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
+  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
+    return &(sh[index]);
+  }
+};
+
+/*
+ * In the barrier implementations, num_active is the number of threads that are
+ * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
+ * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
+ * structure. tdata is the thread private data that resides on the thread
+ * data structure.
+ *
+ * The reset_shared() method is used to initialize the barrier data on the
+ * kmp_hier_top_unit_t hierarchy structure
+ *
+ * The reset_private() method is used to initialize the barrier data on the
+ * thread's private dispatch buffer structure
+ *
+ * The barrier() method takes an id, which is that thread's id for the
+ * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
+ * inside barrier() until all fellow threads who are attached to that
+ * kmp_hier_top_unit_t structure have arrived.
+ */
+
+// Core barrier implementation
+// Can be used in a unit with between 2 to 8 threads
+template <typename T> class core_barrier_impl {
+  static inline kmp_uint64 get_wait_val(int num_active) { // one 0x01 byte per active thread, packed into 64 bits
+    kmp_uint64 wait_val;
+    switch (num_active) {
+    case 2:
+      wait_val = 0x0101LL;
+      break;
+    case 3:
+      wait_val = 0x010101LL;
+      break;
+    case 4:
+      wait_val = 0x01010101LL;
+      break;
+    case 5:
+      wait_val = 0x0101010101LL;
+      break;
+    case 6:
+      wait_val = 0x010101010101LL;
+      break;
+    case 7:
+      wait_val = 0x01010101010101LL;
+      break;
+    case 8:
+      wait_val = 0x0101010101010101LL;
+      break;
+    default:
+      // don't use the core_barrier_impl for more than 8 threads
+      KMP_ASSERT(0);
+    }
+    return wait_val;
+  }
+
+public:
+  static void reset_private(kmp_int32 num_active,
+                            kmp_hier_private_bdata_t *tdata); // init thread-private barrier state
+  static void reset_shared(kmp_int32 num_active,
+                           kmp_hier_shared_bdata_t<T> *bdata); // init shared barrier state
+  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
+                      kmp_hier_private_bdata_t *tdata); // wait until all num_active threads arrive
+};
+
+template <typename T>
+void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
+                                         kmp_hier_private_bdata_t *tdata) {
+  tdata->num_active = num_active;
+  tdata->index = 0; // start on buffer 0
+  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active); // target bit pattern for both buffers
+}
+template <typename T>
+void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
+                                        kmp_hier_shared_bdata_t<T> *bdata) {
+  bdata->val[0] = bdata->val[1] = 0LL; // clear both flag words
+  bdata->status[0] = bdata->status[1] = 0LL; // clear both statuses
+}
+template <typename T>
+void core_barrier_impl<T>::barrier(kmp_int32 id,
+                                   kmp_hier_shared_bdata_t<T> *bdata,
+                                   kmp_hier_private_bdata_t *tdata) {
+  kmp_uint64 current_index = tdata->index;
+  kmp_uint64 next_index = 1 - current_index; // flip double buffer for next barrier
+  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
+  kmp_uint64 next_wait_value =
+      (current_wait_value ? 0 : get_wait_val(tdata->num_active)); // alternate between all-set and all-clear targets
+  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
+                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
+                __kmp_get_gtid(), current_index, next_index, current_wait_value,
+                next_wait_value));
+  char v = (current_wait_value ? 0x1 : 0x0); // byte this thread publishes this round
+  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v; // write byte `id` of the packed flag word
+  __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                               __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); // spin until every thread's byte matches
+  tdata->wait_val[current_index] = next_wait_value;
+  tdata->index = next_index;
+}
+
+// Counter barrier implementation
+// Can be used in a unit with arbitrary number of active threads
+template <typename T> class counter_barrier_impl {
+public:
+  static void reset_private(kmp_int32 num_active,
+                            kmp_hier_private_bdata_t *tdata); // init thread-private counter state
+  static void reset_shared(kmp_int32 num_active,
+                           kmp_hier_shared_bdata_t<T> *bdata); // init shared counter state
+  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
+                      kmp_hier_private_bdata_t *tdata); // wait until all num_active threads arrive
+};
+
+template <typename T>
+void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
+                                            kmp_hier_private_bdata_t *tdata) {
+  tdata->num_active = num_active;
+  tdata->index = 0; // start on buffer 0
+  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active; // first arrival target equals thread count
+}
+template <typename T>
+void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
+                                           kmp_hier_shared_bdata_t<T> *bdata) {
+  bdata->val[0] = bdata->val[1] = 0LL; // clear both counters
+  bdata->status[0] = bdata->status[1] = 0LL; // clear both statuses
+}
+template <typename T>
+void counter_barrier_impl<T>::barrier(kmp_int32 id,
+                                      kmp_hier_shared_bdata_t<T> *bdata,
+                                      kmp_hier_private_bdata_t *tdata) {
+  volatile kmp_int64 *val;
+  kmp_uint64 current_index = tdata->index;
+  kmp_uint64 next_index = 1 - current_index; // flip double buffer for next barrier
+  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
+  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active; // counter keeps growing; each round waits num_active higher
+
+  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
+                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
+                __kmp_get_gtid(), current_index, next_index, current_wait_value,
+                next_wait_value));
+  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
+  KMP_TEST_THEN_INC64(val); // atomically announce this thread's arrival
+  __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                               __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); // spin until counter reaches target
+  tdata->wait_val[current_index] = next_wait_value;
+  tdata->index = next_index;
+}
+
+// Data associated with topology unit within a layer
+// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
+template <typename T> struct kmp_hier_top_unit_t {
+  typedef typename traits_t<T>::signed_t ST;
+  typedef typename traits_t<T>::unsigned_t UT;
+  kmp_int32 active; // number of topology units that communicate with this unit
+  // chunk information (lower/upper bound, stride, etc.)
+  dispatch_private_info_template<T> hier_pr;
+  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
+  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
+
+  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
+  void reset_shared_barrier() {
+    KMP_DEBUG_ASSERT(active > 0);
+    if (active == 1) // single participant: no barrier state needed
+      return;
+    hier_barrier.zero();
+    if (active >= 2 && active <= 8) { // byte-flag barrier only supports 2..8
+      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
+    } else {
+      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
+    }
+  }
+  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(active > 0);
+    if (active == 1) // single participant: no barrier state needed
+      return;
+    if (active >= 2 && active <= 8) { // must match choice in reset_shared_barrier()
+      core_barrier_impl<T>::reset_private(active, tdata);
+    } else {
+      counter_barrier_impl<T>::reset_private(active, tdata);
+    }
+  }
+  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(active > 0);
+    KMP_DEBUG_ASSERT(id >= 0 && id < active);
+    if (active == 1) { // no peers: still flip buffers so curr/next stay in sync
+      tdata->index = 1 - tdata->index;
+      return;
+    }
+    if (active >= 2 && active <= 8) { // dispatch to the matching implementation
+      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
+    } else {
+      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
+    }
+  }
+
+  kmp_int32 get_next_status(kmp_uint64 index) const { // forwarders to the double-buffered barrier data
+    return hier_barrier.get_next_status(index);
+  }
+  T get_next_lb(kmp_uint64 index) const {
+    return hier_barrier.get_next_lb(index);
+  }
+  T get_next_ub(kmp_uint64 index) const {
+    return hier_barrier.get_next_ub(index);
+  }
+  ST get_next_st(kmp_uint64 index) const {
+    return hier_barrier.get_next_st(index);
+  }
+  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
+    return hier_barrier.get_next_sh(index);
+  }
+
+  kmp_int32 get_curr_status(kmp_uint64 index) const {
+    return hier_barrier.get_curr_status(index);
+  }
+  T get_curr_lb(kmp_uint64 index) const {
+    return hier_barrier.get_curr_lb(index);
+  }
+  T get_curr_ub(kmp_uint64 index) const {
+    return hier_barrier.get_curr_ub(index);
+  }
+  ST get_curr_st(kmp_uint64 index) const {
+    return hier_barrier.get_curr_st(index);
+  }
+  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
+    return hier_barrier.get_curr_sh(index);
+  }
+
+  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
+                            kmp_uint64 index) {
+    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
+  }
+  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
+    hier_barrier.set_next(lb, ub, st, status, index);
+  }
+  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
+  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
+  dispatch_private_info_template<T> *get_parent_pr() {
+    return &(hier_parent->hier_pr); // precondition: hier_parent non-NULL -- caller's responsibility
+  }
+
+  kmp_int32 is_active() const { return active; }
+  kmp_int32 get_num_active() const { return active; }
+  void print() { // debug dump at KD_TRACE level 10
+    KD_TRACE(
+        10,
+        ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
+         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
+         hier_pr.u.p.tc));
+  }
+};
+
+// Information regarding a single layer within the scheduling hierarchy
+template <typename T> struct kmp_hier_layer_info_t {
+  int num_active; // number of threads active in this level
+  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
+  enum sched_type sched; // static, dynamic, guided, etc.
+  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
+  int length; // length of the kmp_hier_top_unit_t array
+
+  // Print this layer's information
+  void print() { // debug dump at KD_TRACE level 10
+    const char *t = __kmp_get_hier_str(type);
+    KD_TRACE(
+        10,
+        ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
+         "length:%d\n",
+         num_active, t, sched, chunk, length));
+  }
+};
+
+/*
+ * Structure to implement entire hierarchy
+ *
+ * The hierarchy is kept as an array of arrays to represent the different
+ * layers.  Layer 0 is the lowest layer to layer num_layers - 1 which is the
+ * highest layer.
+ * Example:
+ * [ 2 ] -> [ L3 | L3 ]
+ * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
+ * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
+ * There is also an array of layer_info_t which has information regarding
+ * each layer
+ */
+template <typename T> struct kmp_hier_t {
+public:
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+
+private:
+  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
+                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
+                   kmp_int32 previous_id, int hier_level) {
+    int status;
+    kmp_info_t *th = __kmp_threads[gtid];
+    auto parent = current->get_parent();
+    bool last_layer = (hier_level == get_num_layers() - 1);
+    KMP_DEBUG_ASSERT(th);
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
+    KMP_DEBUG_ASSERT(current);
+    KMP_DEBUG_ASSERT(hier_level >= 0);
+    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(parent || last_layer);
+
+    KD_TRACE(
+        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
+
+    T hier_id = (T)current->get_hier_id();
+    // Attempt to grab next iteration range for this level
+    if (previous_id == 0) {
+      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
+                   gtid, hier_level));
+      kmp_int32 contains_last;
+      T my_lb, my_ub;
+      ST my_st;
+      T nproc;
+      dispatch_shared_info_template<T> volatile *my_sh;
+      dispatch_private_info_template<T> *my_pr;
+      if (last_layer) {
+        // last layer below the very top uses the single shared buffer
+        // from the team struct.
+        KD_TRACE(10,
+                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
+                  gtid, hier_level));
+        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+        nproc = (T)get_top_level_nproc();
+      } else {
+        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
+        // structure
+        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
+                      gtid, hier_level));
+        my_sh =
+            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
+        nproc = (T)parent->get_num_active();
+      }
+      my_pr = current->get_my_pr();
+      KMP_DEBUG_ASSERT(my_sh);
+      KMP_DEBUG_ASSERT(my_pr);
+      enum sched_type schedule = get_sched(hier_level);
+      ST chunk = (ST)get_chunk(hier_level);
+      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
+                                                &contains_last, &my_lb, &my_ub,
+                                                &my_st, nproc, hier_id);
+      KD_TRACE(
+          10,
+          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
+           gtid, hier_level, status));
+      // When no iterations are found (status == 0) and this is not the last
+      // layer, attempt to go up the hierarchy for more iterations
+      if (status == 0 && !last_layer) {
+        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
+                              &my_st, hier_id, hier_level + 1);
+        KD_TRACE(
+            10,
+            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
+             gtid, hier_level, status));
+        if (status == 1) {
+          kmp_hier_private_bdata_t *upper_tdata =
+              &(th->th.th_hier_bar_data[hier_level + 1]);
+          my_sh = parent->get_curr_sh(upper_tdata->index);
+          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
+                        gtid, hier_level));
+          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
+                                        parent->get_curr_lb(upper_tdata->index),
+                                        parent->get_curr_ub(upper_tdata->index),
+                                        parent->get_curr_st(upper_tdata->index),
+#if USE_ITT_BUILD
+                                        NULL,
+#endif
+                                        chunk, nproc, hier_id);
+          status = __kmp_dispatch_next_algorithm<T>(
+              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
+              hier_id);
+          if (!status) {
+            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
+                          "setting to 2!\n",
+                          gtid, hier_level));
+            status = 2;
+          }
+        }
+      }
+      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
+      // Propagate whether a unit holds the actual global last iteration
+      // The contains_last attribute is sent downwards from the top to the
+      // bottom of the hierarchy via the contains_last flag inside the
+      // private dispatch buffers in the hierarchy's middle layers
+      if (contains_last) {
+        // If the next_algorithm() method returns 1 for p_last and it is the
+        // last layer or our parent contains the last serial chunk, then the
+        // chunk must contain the last serial iteration.
+        if (last_layer || parent->hier_pr.flags.contains_last) {
+          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
+                        "to contain last.\n",
+                        gtid, hier_level));
+          current->hier_pr.flags.contains_last = contains_last;
+        }
+        if (!current->hier_pr.flags.contains_last)
+          contains_last = FALSE;
+      }
+      if (p_last)
+        *p_last = contains_last;
+    } // if master thread of this unit
+    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
+      KD_TRACE(10,
+               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
+                gtid, hier_level));
+      current->barrier(previous_id, tdata);
+      KD_TRACE(10,
+               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
+                gtid, hier_level, current->get_curr_status(tdata->index)));
+    } else {
+      KMP_DEBUG_ASSERT(previous_id == 0);
+      return status;
+    }
+    return current->get_curr_status(tdata->index);
+  }
+
+public:
+  int top_level_nproc;
+  int num_layers;
+  bool valid;
+  int type_size;
+  kmp_hier_layer_info_t<T> *info;
+  kmp_hier_top_unit_t<T> **layers;
+  // Deallocate all memory from this hierarchy
+  void deallocate() {
+    for (int i = 0; i < num_layers; ++i)
+      if (layers[i] != NULL) {
+        __kmp_free(layers[i]);
+      }
+    if (layers != NULL) {
+      __kmp_free(layers);
+      layers = NULL;
+    }
+    if (info != NULL) {
+      __kmp_free(info);
+      info = NULL;
+    }
+    num_layers = 0;
+    valid = false;
+  }
+  // Returns true if reallocation is needed else false
+  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
+                          const enum sched_type *new_scheds,
+                          const ST *new_chunks) const {
+    if (!valid || layers == NULL || info == NULL ||
+        traits_t<T>::type_size != type_size || n != num_layers)
+      return true;
+    for (int i = 0; i < n; ++i) {
+      if (info[i].type != new_layers[i])
+        return true;
+      if (info[i].sched != new_scheds[i])
+        return true;
+      if (info[i].chunk != new_chunks[i])
+        return true;
+    }
+    return false;
+  }
+  // A single thread should call this function while the other threads wait
+  // create a new scheduling hierarchy consisting of new_layers, new_scheds
+  // and new_chunks.  These should come pre-sorted according to
+  // kmp_hier_layer_e value.  This function will try to avoid reallocation
+  // if it can
+  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
+                     const enum sched_type *new_scheds, const ST *new_chunks) {
+    top_level_nproc = 0;
+    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
+      KD_TRACE(
+          10,
+          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
+      for (int i = 0; i < n; ++i) {
+        info[i].num_active = 0;
+        for (int j = 0; j < get_length(i); ++j)
+          layers[i][j].active = 0;
+      }
+      return;
+    }
+    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
+    deallocate();
+    type_size = traits_t<T>::type_size;
+    num_layers = n;
+    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
+        sizeof(kmp_hier_layer_info_t<T>) * n);
+    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
+        sizeof(kmp_hier_top_unit_t<T> *) * n);
+    for (int i = 0; i < n; ++i) {
+      int max = 0;
+      kmp_hier_layer_e layer = new_layers[i];
+      info[i].num_active = 0;
+      info[i].type = layer;
+      info[i].sched = new_scheds[i];
+      info[i].chunk = new_chunks[i];
+      max = __kmp_hier_max_units[layer + 1];
+      if (max == 0) {
+        valid = false;
+        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
+        deallocate();
+        return;
+      }
+      info[i].length = max;
+      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
+          sizeof(kmp_hier_top_unit_t<T>) * max);
+      for (int j = 0; j < max; ++j) {
+        layers[i][j].active = 0;
+      }
+    }
+    valid = true;
+  }
+  // loc - source file location
+  // gtid - global thread identifier
+  // pr - this thread's private dispatch buffer (corresponding with gtid)
+  // p_last (return value) - pointer to flag indicating this set of iterations
+  // contains last
+  //          iteration
+  // p_lb (return value) - lower bound for this chunk of iterations
+  // p_ub (return value) - upper bound for this chunk of iterations
+  // p_st (return value) - stride for this chunk of iterations
+  //
+  // Returns 1 if there are more iterations to perform, 0 otherwise
+  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
+           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
+    int status;
+    kmp_int32 contains_last = 0;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
+    auto parent = pr->get_parent();
+    KMP_DEBUG_ASSERT(parent);
+    KMP_DEBUG_ASSERT(th);
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(parent);
+    T nproc = (T)parent->get_num_active();
+    T unit_id = (T)pr->get_hier_id();
+    KD_TRACE(
+        10,
+        ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
+         gtid, nproc, unit_id));
+    // Handthreading implementation
+    // Each iteration is performed by all threads on last unit (typically
+    // cores/tiles)
+    // e.g., threads 0,1,2,3 all execute iteration 0
+    //       threads 0,1,2,3 all execute iteration 1
+    //       threads 4,5,6,7 all execute iteration 2
+    //       threads 4,5,6,7 all execute iteration 3
+    //       ... etc.
+    if (__kmp_dispatch_hand_threading) {
+      KD_TRACE(10,
+               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
+                gtid));
+      if (unit_id == 0) {
+        // For hand threading, the sh buffer on the lowest level is only ever
+        // modified and read by the master thread on that level.  Because of
+        // this, we can always use the first sh buffer.
+        auto sh = &(parent->hier_barrier.sh[0]);
+        KMP_DEBUG_ASSERT(sh);
+        status = __kmp_dispatch_next_algorithm<T>(
+            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+        if (!status) {
+          bool done = false;
+          while (!done) {
+            done = true;
+            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
+                                  p_st, unit_id, 0);
+            if (status == 1) {
+              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
+                                            parent->get_next_lb(tdata->index),
+                                            parent->get_next_ub(tdata->index),
+                                            parent->get_next_st(tdata->index),
+#if USE_ITT_BUILD
+                                            NULL,
+#endif
+                                            pr->u.p.parm1, nproc, unit_id);
+              sh->u.s.iteration = 0;
+              status = __kmp_dispatch_next_algorithm<T>(
+                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
+                  unit_id);
+              if (!status) {
+                KD_TRACE(10,
+                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
+                          "after next_pr_sh()"
+                          "trying again.\n",
+                          gtid));
+                done = false;
+              }
+            } else if (status == 2) {
+              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
+                            "trying again.\n",
+                            gtid));
+              done = false;
+            }
+          }
+        }
+        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
+      } // if master thread of lowest unit level
+      parent->barrier(pr->get_hier_id(), tdata);
+      if (unit_id != 0) {
+        *p_lb = parent->get_curr_lb(tdata->index);
+        *p_ub = parent->get_curr_ub(tdata->index);
+        *p_st = parent->get_curr_st(tdata->index);
+        status = parent->get_curr_status(tdata->index);
+      }
+    } else {
+      // Normal implementation
+      // Each thread grabs an iteration chunk and executes it (no cooperation)
+      auto sh = parent->get_curr_sh(tdata->index);
+      KMP_DEBUG_ASSERT(sh);
+      status = __kmp_dispatch_next_algorithm<T>(
+          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+      KD_TRACE(10,
+               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
+                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
+                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
+      if (!status) {
+        bool done = false;
+        while (!done) {
+          done = true;
+          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
+                                p_st, unit_id, 0);
+          if (status == 1) {
+            sh = parent->get_curr_sh(tdata->index);
+            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
+                                          parent->get_curr_lb(tdata->index),
+                                          parent->get_curr_ub(tdata->index),
+                                          parent->get_curr_st(tdata->index),
+#if USE_ITT_BUILD
+                                          NULL,
+#endif
+                                          pr->u.p.parm1, nproc, unit_id);
+            status = __kmp_dispatch_next_algorithm<T>(
+                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+            if (!status) {
+              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
+                            "after next_pr_sh()"
+                            "trying again.\n",
+                            gtid));
+              done = false;
+            }
+          } else if (status == 2) {
+            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
+                          "trying again.\n",
+                          gtid));
+            done = false;
+          }
+        }
+      }
+    }
+    if (contains_last && !parent->hier_pr.flags.contains_last) {
+      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
+                    "contains_last to FALSE\n",
+                    gtid));
+      contains_last = FALSE;
+    }
+    if (p_last)
+      *p_last = contains_last;
+    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
+                  status));
+    return status;
+  }
+  // These functions probe the layer info structure
+  // Returns the type of topology unit given level
+  kmp_hier_layer_e get_type(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    return info[level].type;
+  }
+  // Returns the schedule type at given level
+  enum sched_type get_sched(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    return info[level].sched;
+  }
+  // Returns the chunk size at given level
+  ST get_chunk(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    return info[level].chunk;
+  }
+  // Returns the number of active threads at given level
+  int get_num_active(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    return info[level].num_active;
+  }
+  // Returns the length of topology unit array at given level
+  int get_length(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    return info[level].length;
+  }
+  // Returns the topology unit given the level and index
+  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    KMP_DEBUG_ASSERT(index >= 0);
+    KMP_DEBUG_ASSERT(index < get_length(level));
+    return &(layers[level][index]);
+  }
+  // Returns the number of layers in the hierarchy
+  int get_num_layers() const { return num_layers; }
+  // Returns the number of threads in the top layer
+  // This is necessary because we don't store a topology unit as
+  // the very top level and the scheduling algorithms need this information
+  int get_top_level_nproc() const { return top_level_nproc; }
+  // Return whether this hierarchy is valid or not
+  bool is_valid() const { return valid; }
+  // Print the hierarchy
+  void print() {
+    KD_TRACE(10, ("kmp_hier_t:\n"));
+    for (int i = num_layers - 1; i >= 0; --i) {
+      KD_TRACE(10, ("Info[%d] = ", i));
+      info[i].print();
+    }
+    for (int i = num_layers - 1; i >= 0; --i) {
+      KD_TRACE(10, ("Layer[%d] =\n", i));
+      for (int j = 0; j < info[i].length; ++j) {
+        layers[i][j].print();
+      }
+    }
+  }
+};
+
+template <typename T>
+void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
+                                   kmp_hier_layer_e *new_layers,
+                                   enum sched_type *new_scheds,
+                                   typename traits_t<T>::signed_t *new_chunks,
+                                   T lb, T ub,
+                                   typename traits_t<T>::signed_t st) {
+  typedef typename traits_t<T>::signed_t ST;
+  typedef typename traits_t<T>::unsigned_t UT;
+  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
+  int my_buffer_index;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  dispatch_private_info_template<T> *pr;
+  dispatch_shared_info_template<T> volatile *sh;
+  gtid = __kmp_entry_gtid();
+  tid = __kmp_tid_from_gtid(gtid);
+#ifdef KMP_DEBUG
+  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
+                gtid, n));
+  for (int i = 0; i < n; ++i) {
+    const char *layer = __kmp_get_hier_str(new_layers[i]);
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
+                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
+                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
+  }
+#endif // KMP_DEBUG
+  KMP_DEBUG_ASSERT(n > 0);
+  KMP_DEBUG_ASSERT(new_layers);
+  KMP_DEBUG_ASSERT(new_scheds);
+  KMP_DEBUG_ASSERT(new_chunks);
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  active = !team->t.t_serialized;
+  th->th.th_ident = loc;
+  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
+  if (!active) {
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
+                  "Using normal dispatch functions.\n",
+                  gtid));
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        th->th.th_dispatch->th_disp_buffer);
+    KMP_DEBUG_ASSERT(pr);
+    pr->flags.use_hier = FALSE;
+    pr->flags.contains_last = FALSE;
+    return;
+  }
+  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+  my_buffer_index = th->th.th_dispatch->th_disp_index;
+  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+      &th->th.th_dispatch
+           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  KMP_DEBUG_ASSERT(pr);
+  KMP_DEBUG_ASSERT(sh);
+  pr->flags.use_hier = TRUE;
+  pr->u.p.tc = 0;
+  // Have master allocate the hierarchy
+  if (__kmp_tid_from_gtid(gtid) == 0) {
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
+                  "hierarchy\n",
+                  gtid, pr, sh));
+    if (sh->hier == NULL) {
+      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
+    }
+    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
+    sh->u.s.iteration = 0;
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+  // Check to make sure the hierarchy is valid
+  kmp_hier_t<T> *hier = sh->hier;
+  if (!sh->hier->is_valid()) {
+    pr->flags.use_hier = FALSE;
+    return;
+  }
+  // Have threads allocate their thread-private barrier data if it hasn't
+  // already been allocated
+  if (th->th.th_hier_bar_data == NULL) {
+    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
+        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
+  }
+  // Have threads "register" themselves by modifying the active count for each
+  // level they are involved in. The active count will act as nthreads for that
+  // level regarding the scheduling algorithms
+  for (int i = 0; i < n; ++i) {
+    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
+    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
+    // Setup the thread's private dispatch buffer's hierarchy pointers
+    if (i == 0)
+      pr->hier_parent = my_unit;
+    // If this unit is already active, then increment active count and wait
+    if (my_unit->is_active()) {
+      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
+                    "is already active (%d)\n",
+                    gtid, my_unit, my_unit->active));
+      KMP_TEST_THEN_INC32(&(my_unit->active));
+      break;
+    }
+    // Flag that this unit is active
+    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
+      // Do not setup parent pointer for top level unit since it has no parent
+      if (i < n - 1) {
+        // Setup middle layer pointers to parents
+        my_unit->get_my_pr()->hier_id =
+            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
+                                                 hier->get_type(i + 1));
+        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
+        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
+      } else {
+        // Setup top layer information (no parent pointers are set)
+        my_unit->get_my_pr()->hier_id =
+            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
+                                                 kmp_hier_layer_e::LAYER_LOOP);
+        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
+        my_unit->hier_parent = nullptr;
+      }
+      // Set trip count to 0 so that next() operation will initially climb up
+      // the hierarchy to get more iterations (early exit in next() for tc == 0)
+      my_unit->get_my_pr()->u.p.tc = 0;
+      // Increment this layer's number of active units
+      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
+      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
+                    "incrementing num_active\n",
+                    gtid, my_unit));
+    } else {
+      KMP_TEST_THEN_INC32(&(my_unit->active));
+      break;
+    }
+  }
+  // Set this thread's id
+  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
+      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
+  pr->hier_id = tid % num_threads_per_layer1;
+  // For oversubscribed threads, increment their index within the lowest unit
+  // This is done to prevent having two or more threads with id 0, id 1, etc.
+  if (tid >= num_hw_threads)
+    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
+  KD_TRACE(
+      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
+           gtid, pr->hier_id));
+
+  pr->flags.contains_last = FALSE;
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+  // Now that the number of active threads at each level is determined,
+  // the barrier data for each unit can be initialized and the last layer's
+  // loop information can be initialized.
+  int prev_id = pr->get_hier_id();
+  for (int i = 0; i < n; ++i) {
+    if (prev_id != 0)
+      break;
+    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
+    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
+    // Only master threads of this unit within the hierarchy do initialization
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
+                  gtid, i));
+    my_unit->reset_shared_barrier();
+    my_unit->hier_pr.flags.contains_last = FALSE;
+    // Last layer, initialize the private buffers with entire loop information
+    // Now the next next_algorithm() call will get the first chunk of
+    // iterations properly
+    if (i == n - 1) {
+      __kmp_dispatch_init_algorithm<T>(
+          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
+#if USE_ITT_BUILD
+          NULL,
+#endif
+          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
+    }
+    prev_id = my_unit->get_hier_id();
+  }
+  // Initialize each layer of the thread's private barrier data
+  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
+  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
+    unit->reset_private_barrier(tdata);
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+#ifdef KMP_DEBUG
+  if (__kmp_tid_from_gtid(gtid) == 0) {
+    for (int i = 0; i < n; ++i) {
+      KD_TRACE(10,
+               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
+                gtid, i, hier->get_num_active(i)));
+    }
+    hier->print();
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#endif // KMP_DEBUG
+}
+#endif
diff --git a/runtime/src/kmp_ftn_entry.h b/runtime/src/kmp_ftn_entry.h
index 0f09828..80db92d 100644
--- a/runtime/src/kmp_ftn_entry.h
+++ b/runtime/src/kmp_ftn_entry.h
@@ -599,7 +599,7 @@
 }
 
 #if OMP_45_ENABLED
-int FTN_STDCALL FTN_GET_NUM_PLACES(void) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
   return 0;
 #else
@@ -612,7 +612,7 @@
 #endif
 }
 
-int FTN_STDCALL FTN_GET_PLACE_NUM_PROCS(int place_num) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
   return 0;
 #else
@@ -637,7 +637,8 @@
 #endif
 }
 
-void FTN_STDCALL FTN_GET_PLACE_PROC_IDS(int place_num, int *ids) {
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
+                                                         int *ids) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
 // Nothing.
 #else
@@ -661,7 +662,7 @@
 #endif
 }
 
-int FTN_STDCALL FTN_GET_PLACE_NUM(void) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
   return -1;
 #else
@@ -680,7 +681,7 @@
 #endif
 }
 
-int FTN_STDCALL FTN_GET_PARTITION_NUM_PLACES(void) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
   return 0;
 #else
@@ -708,7 +709,8 @@
 #endif
 }
 
-void FTN_STDCALL FTN_GET_PARTITION_PLACE_NUMS(int *place_nums) {
+void
+    FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
 // Nothing.
 #else
@@ -1192,7 +1194,7 @@
 
 #if OMP_45_ENABLED
 /* returns the maximum allowed task priority */
-int FTN_STDCALL FTN_GET_MAX_TASK_PRIORITY(void) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
 #ifdef KMP_STUB
   return 0;
 #else
@@ -1291,6 +1293,13 @@
 
 #if OMP_45_ENABLED
 // OMP_4.5 versioned symbols
+KMP_VERSION_SYMBOL(FTN_GET_MAX_TASK_PRIORITY, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_PLACES, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM_PROCS, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_PROC_IDS, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PARTITION_NUM_PLACES, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PARTITION_PLACE_NUMS, 45, "OMP_4.5");
 #endif
 
 #if OMP_50_ENABLED
diff --git a/runtime/src/kmp_ftn_os.h b/runtime/src/kmp_ftn_os.h
index 00115cb..d74388a 100644
--- a/runtime/src/kmp_ftn_os.h
+++ b/runtime/src/kmp_ftn_os.h
@@ -616,5 +616,25 @@
 // All GOMP_4.5 symbols
 #define KMP_API_NAME_GOMP_TASKLOOP GOMP_taskloop
 #define KMP_API_NAME_GOMP_TASKLOOP_ULL GOMP_taskloop_ull
+#define KMP_API_NAME_GOMP_DOACROSS_POST GOMP_doacross_post
+#define KMP_API_NAME_GOMP_DOACROSS_WAIT GOMP_doacross_wait
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START                           \
+  GOMP_loop_doacross_static_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START                          \
+  GOMP_loop_doacross_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START                           \
+  GOMP_loop_doacross_guided_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START                          \
+  GOMP_loop_doacross_runtime_start
+#define KMP_API_NAME_GOMP_DOACROSS_ULL_POST GOMP_doacross_ull_post
+#define KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT GOMP_doacross_ull_wait
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START                       \
+  GOMP_loop_ull_doacross_static_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START                      \
+  GOMP_loop_ull_doacross_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START                       \
+  GOMP_loop_ull_doacross_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START                      \
+  GOMP_loop_ull_doacross_runtime_start
 
 #endif /* KMP_FTN_OS_H */
diff --git a/runtime/src/kmp_global.cpp b/runtime/src/kmp_global.cpp
index 3ba0ec9..f6d929f 100644
--- a/runtime/src/kmp_global.cpp
+++ b/runtime/src/kmp_global.cpp
@@ -13,6 +13,9 @@
 
 #include "kmp.h"
 #include "kmp_affinity.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
 
 kmp_key_t __kmp_gtid_threadprivate_key;
 
@@ -57,8 +60,8 @@
 int __kmp_root_counter = 0;
 int __kmp_version = 0;
 
-volatile kmp_uint32 __kmp_team_counter = 0;
-volatile kmp_uint32 __kmp_task_counter = 0;
+std::atomic<kmp_uint32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
+std::atomic<kmp_uint32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
 
 unsigned int __kmp_init_wait =
     KMP_DEFAULT_INIT_WAIT; /* initial number of spin-tests   */
@@ -148,6 +151,12 @@
     kmp_sch_guided_iterative_chunked; /* default guided scheduling method */
 enum sched_type __kmp_auto =
     kmp_sch_guided_analytical_chunked; /* default auto scheduling method */
+#if KMP_USE_HIER_SCHED
+int __kmp_dispatch_hand_threading = 0;
+int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
+int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
+kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL};
+#endif
 int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
 #if KMP_USE_MONITOR
 int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS;
@@ -195,7 +204,7 @@
     1, 1024}; // TODO: tune it!
 
 #if KMP_DEBUG_ADAPTIVE_LOCKS
-char *__kmp_speculative_statsfile = "-";
+const char *__kmp_speculative_statsfile = "-";
 #endif
 
 #endif // KMP_USE_ADAPTIVE_LOCKS
@@ -335,8 +344,8 @@
     FALSE; /* TRUE means use atomic update of buffer entry pointer */
 
 char *__kmp_debug_buffer = NULL; /* Debug buffer itself */
-int __kmp_debug_count =
-    0; /* Counter for number of lines printed in buffer so far */
+std::atomic<int> __kmp_debug_count =
+    ATOMIC_VAR_INIT(0); /* number of lines printed in buffer so far */
 int __kmp_debug_buf_warn_chars =
     0; /* Keep track of char increase recommended in warnings */
 /* end rotating debug buffer */
@@ -402,7 +411,7 @@
 volatile kmp_team_t *__kmp_team_pool = NULL;
 
 KMP_ALIGN_CACHE
-volatile int __kmp_thread_pool_active_nth = 0;
+std::atomic<int> __kmp_thread_pool_active_nth = ATOMIC_VAR_INIT(0);
 
 /* -------------------------------------------------
  * GLOBAL/ROOT STATE */
@@ -418,47 +427,47 @@
  * false sharing if the alignment is not large enough for these locks */
 KMP_ALIGN_CACHE_INTERNODE
 
-kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
-    __kmp_initz_lock); /* Control initializations */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
 #if KMP_USE_MONITOR
+/* control monitor thread creation */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
 #endif
 /* used for the hack to allow threadprivate cache and __kmp_threads expansion
    to co-exist */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_tp_cached_lock;
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 
 KMP_ALIGN_CACHE_INTERNODE
-kmp_lock_t __kmp_global_lock; /* Control OS/global access */
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN_CACHE_INTERNODE
 kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access  */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #else
 KMP_ALIGN_CACHE
 
-kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
-    __kmp_initz_lock); /* Control initializations */
-kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
-kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
 #if KMP_USE_MONITOR
-kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+/* control monitor thread creation */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
 #endif
 /* used for the hack to allow threadprivate cache and __kmp_threads expansion
    to co-exist */
-kmp_bootstrap_lock_t __kmp_tp_cached_lock;
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 
 KMP_ALIGN(128)
-kmp_lock_t __kmp_global_lock; /* Control OS/global access */
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN(128)
 kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access  */
 KMP_ALIGN(128)
-kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #endif
 
 /* ----------------------------------------------- */
diff --git a/runtime/src/kmp_gsupport.cpp b/runtime/src/kmp_gsupport.cpp
index 7aa4a66..297a8f7 100644
--- a/runtime/src/kmp_gsupport.cpp
+++ b/runtime/src/kmp_gsupport.cpp
@@ -108,7 +108,7 @@
 
 void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_END)(void) {
   int gtid = __kmp_get_gtid();
-  KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+  KA_TRACE(20, ("GOMP_atomic_end: T#%d\n", gtid));
   __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
 }
 
@@ -456,7 +456,6 @@
 void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) {
   int gtid = __kmp_get_gtid();
   kmp_info_t *thr;
-  int ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc;
 
   thr = __kmp_threads[gtid];
 
@@ -523,10 +522,12 @@
     int status;                                                                \
     long stride;                                                               \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20,                                                               \
-             (#func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \
-              gtid, lb, ub, str, chunk_sz));                                   \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
       IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
@@ -544,9 +545,11 @@
       status = 0;                                                              \
     }                                                                          \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",   \
-              gtid, *p_lb, *p_ub, status));                                    \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
     return status;                                                             \
   }
 
@@ -556,10 +559,11 @@
     long stride;                                                               \
     long chunk_sz = 0;                                                         \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20,                                                               \
-             (#func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n",    \
-              gtid, lb, ub, str, chunk_sz));                                   \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \
+         gtid, lb, ub, str, chunk_sz));                                        \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
       IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
@@ -576,19 +580,30 @@
       status = 0;                                                              \
     }                                                                          \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",   \
-              gtid, *p_lb, *p_ub, status));                                    \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
     return status;                                                             \
   }
 
+#if OMP_45_ENABLED
+#define KMP_DOACROSS_FINI(status, gtid)                                        \
+  if (!status && __kmp_threads[gtid]->th.th_dispatch->th_doacross_flags) {     \
+    __kmpc_doacross_fini(NULL, gtid);                                          \
+  }
+#else
+#define KMP_DOACROSS_FINI(status, gtid) /* Nothing */
+#endif
+
 #define LOOP_NEXT(func, fini_code)                                             \
   int func(long *p_lb, long *p_ub) {                                           \
     int status;                                                                \
     long stride;                                                               \
     int gtid = __kmp_get_gtid();                                               \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20, (#func ": T#%d\n", gtid));                                    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid));                            \
                                                                                \
     IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                          \
     fini_code status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,    \
@@ -596,11 +611,13 @@
     if (status) {                                                              \
       *p_ub += (stride > 0) ? 1 : -1;                                          \
     }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid)                                            \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, "    \
-                    "returning %d\n",                                          \
-              gtid, *p_lb, *p_ub, stride, status));                            \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, " \
+                       "returning %d\n",                                       \
+         gtid, *p_lb, *p_ub, stride, status));                                 \
     return status;                                                             \
   }
 
@@ -634,6 +651,114 @@
 LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT),
           { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
 
+#if OMP_45_ENABLED
+#define LOOP_DOACROSS_START(func, schedule)                                    \
+  bool func(unsigned ncounts, long *counts, long chunk_sz, long *p_lb,         \
+            long *p_ub) {                                                      \
+    int status;                                                                \
+    long stride, lb, ub, str;                                                  \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, ncounts %u, lb 0x%lx, ub 0x%lx, str " \
+                                "0x%lx, chunk_sz "                             \
+                                "0x%lx\n",                                     \
+                  gtid, ncounts, lb, ub, str, chunk_sz));                      \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,        \
+                        (schedule) != kmp_sch_static);                         \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+#define LOOP_DOACROSS_RUNTIME_START(func, schedule)                            \
+  int func(unsigned ncounts, long *counts, long *p_lb, long *p_ub) {           \
+    int status;                                                                \
+    long stride, lb, ub, str;                                                  \
+    long chunk_sz = 0;                                                         \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \
+         gtid, lb, ub, str, chunk_sz));                                        \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START),
+    kmp_sch_static)
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START),
+    kmp_sch_dynamic_chunked)
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START),
+    kmp_sch_guided_chunked)
+LOOP_DOACROSS_RUNTIME_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START),
+    kmp_sch_runtime)
+#endif // OMP_45_ENABLED
+
 void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END)(void) {
   int gtid = __kmp_get_gtid();
   KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid))
@@ -672,13 +797,11 @@
     long long str2 = up ? ((long long)str) : -((long long)str);                \
     long long stride;                                                          \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
+    MKLOC(loc, KMP_STR(func));                                                 \
                                                                                \
-    KA_TRACE(                                                                  \
-        20,                                                                    \
-        (#func                                                                 \
-         ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str 0x%llx, chunk_sz 0x%llx\n", \
-         gtid, up, lb, ub, str, chunk_sz));                                    \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str "    \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, up, lb, ub, str, chunk_sz));                           \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
       KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
@@ -695,9 +818,11 @@
       status = 0;                                                              \
     }                                                                          \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \
-              gtid, *p_lb, *p_ub, status));                                    \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
     return status;                                                             \
   }
 
@@ -710,13 +835,11 @@
     unsigned long long stride;                                                 \
     unsigned long long chunk_sz = 0;                                           \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
+    MKLOC(loc, KMP_STR(func));                                                 \
                                                                                \
-    KA_TRACE(                                                                  \
-        20,                                                                    \
-        (#func                                                                 \
-         ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str 0x%llx, chunk_sz 0x%llx\n", \
-         gtid, up, lb, ub, str, chunk_sz));                                    \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str "    \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, up, lb, ub, str, chunk_sz));                           \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
       KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
@@ -733,9 +856,11 @@
       status = 0;                                                              \
     }                                                                          \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \
-              gtid, *p_lb, *p_ub, status));                                    \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
     return status;                                                             \
   }
 
@@ -744,8 +869,8 @@
     int status;                                                                \
     long long stride;                                                          \
     int gtid = __kmp_get_gtid();                                               \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20, (#func ": T#%d\n", gtid));                                    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid));                            \
                                                                                \
     fini_code status =                                                         \
         KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,            \
@@ -754,10 +879,12 @@
       *p_ub += (stride > 0) ? 1 : -1;                                          \
     }                                                                          \
                                                                                \
-    KA_TRACE(20,                                                               \
-             (#func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, " \
-                    "returning %d\n",                                          \
-              gtid, *p_lb, *p_ub, stride, status));                            \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, "  \
+                   "returning %d\n",                                           \
+         gtid, *p_lb, *p_ub, stride, status));                                 \
     return status;                                                             \
   }
 
@@ -793,6 +920,118 @@
 LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT),
               { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
 
+#if OMP_45_ENABLED
+#define LOOP_DOACROSS_START_ULL(func, schedule)                                \
+  int func(unsigned ncounts, unsigned long long *counts,                       \
+           unsigned long long chunk_sz, unsigned long long *p_lb,              \
+           unsigned long long *p_ub) {                                         \
+    int status;                                                                \
+    long long stride, str, lb, ub;                                             \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str "           \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, lb, ub, str, chunk_sz));                               \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,    \
+                            (schedule) != kmp_sch_static);                     \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+#define LOOP_DOACROSS_RUNTIME_START_ULL(func, schedule)                        \
+  int func(unsigned ncounts, unsigned long long *counts,                       \
+           unsigned long long *p_lb, unsigned long long *p_ub) {               \
+    int status;                                                                \
+    unsigned long long stride, str, lb, ub;                                    \
+    unsigned long long chunk_sz = 0;                                           \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str "           \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, lb, ub, str, chunk_sz));                               \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,    \
+                            TRUE);                                             \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT((long long)stride == str);                            \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START),
+    kmp_sch_static)
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START),
+    kmp_sch_dynamic_chunked)
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START),
+    kmp_sch_guided_chunked)
+LOOP_DOACROSS_RUNTIME_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START),
+    kmp_sch_runtime)
+#endif
+
 // Combined parallel / loop worksharing constructs
 //
 // There are no ull versions (yet).
@@ -801,10 +1040,12 @@
   void func(void (*task)(void *), void *data, unsigned num_threads, long lb,   \
             long ub, long str, long chunk_sz) {                                \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20,                                                               \
-             (#func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \
-              gtid, lb, ub, str, chunk_sz));                                   \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
                                                                                \
     ompt_pre();                                                                \
                                                                                \
@@ -828,7 +1069,7 @@
                                                                                \
     ompt_post();                                                               \
                                                                                \
-    KA_TRACE(20, (#func " exit: T#%d\n", gtid));                               \
+    KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid));                       \
   }
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -951,7 +1192,6 @@
     ompt_thread_info_t oldInfo;
     kmp_info_t *thread;
     kmp_taskdata_t *taskdata;
-    kmp_taskdata_t *current_task;
     if (ompt_enabled.enabled) {
       // Store the threads states and restore them after the task
       thread = __kmp_threads[gtid];
@@ -1224,10 +1464,12 @@
   void func(void (*task)(void *), void *data, unsigned num_threads, long lb,   \
             long ub, long str, long chunk_sz, unsigned flags) {                \
     int gtid = __kmp_entry_gtid();                                             \
-    MKLOC(loc, #func);                                                         \
-    KA_TRACE(20,                                                               \
-             (#func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \
-              gtid, lb, ub, str, chunk_sz));                                   \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
                                                                                \
     ompt_pre();                                                                \
     if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {                       \
@@ -1253,7 +1495,7 @@
     KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();                         \
     ompt_post();                                                               \
                                                                                \
-    KA_TRACE(20, (#func " exit: T#%d\n", gtid));                               \
+    KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid));                       \
   }
 
 PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC),
@@ -1521,6 +1763,52 @@
                   sched, (kmp_uint64)num_tasks, (void *)task_dup);
 }
 
+// 4 byte version of GOMP_doacross_post
+// This verison needs to create a temporary array which converts 4 byte
+// integers into 8 byte integeres
+template <typename T, bool need_conversion = (sizeof(long) == 4)>
+void __kmp_GOMP_doacross_post(T *count);
+
+template <> void __kmp_GOMP_doacross_post<long, true>(long *count) {
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *th = __kmp_threads[gtid];
+  MKLOC(loc, "GOMP_doacross_post");
+  kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0];
+  kmp_int64 *vec =
+      (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims);
+  for (kmp_int64 i = 0; i < num_dims; ++i) {
+    vec[i] = (kmp_int64)count[i];
+  }
+  __kmpc_doacross_post(&loc, gtid, vec);
+  __kmp_thread_free(th, vec);
+}
+
+// 8 byte versions of GOMP_doacross_post
+// This version can just pass in the count array directly instead of creating
+// a temporary array
+template <> void __kmp_GOMP_doacross_post<long, false>(long *count) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_doacross_post");
+  __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count));
+}
+
+template <typename T> void __kmp_GOMP_doacross_wait(T first, va_list args) {
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *th = __kmp_threads[gtid];
+  MKLOC(loc, "GOMP_doacross_wait");
+  kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0];
+  kmp_int64 *vec =
+      (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims);
+  vec[0] = (kmp_int64)first;
+  for (kmp_int64 i = 1; i < num_dims; ++i) {
+    T item = va_arg(args, T);
+    vec[i] = (kmp_int64)item;
+  }
+  __kmpc_doacross_wait(&loc, gtid, vec);
+  __kmp_thread_free(th, vec);
+  return;
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif // __cplusplus
@@ -1543,7 +1831,33 @@
                                       priority, start, end, step);
 }
 
-#endif
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_POST)(long *count) {
+  __kmp_GOMP_doacross_post(count);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_WAIT)(long first, ...) {
+  va_list args;
+  va_start(args, first);
+  __kmp_GOMP_doacross_wait<long>(first, args);
+  va_end(args);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_POST)(
+    unsigned long long *count) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_doacross_ull_post");
+  __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count));
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)(
+    unsigned long long first, ...) {
+  va_list args;
+  va_start(args, first);
+  __kmp_GOMP_doacross_wait<unsigned long long>(first, args);
+  va_end(args);
+}
+
+#endif // OMP_45_ENABLED
 
 /* The following sections of code create aliases for the GOMP_* functions, then
    create versioned symbols using the assembler directive .symver. This is only
@@ -1653,9 +1967,30 @@
 KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0");
 #endif
 
+// GOMP_4.5 versioned symbols
 #if OMP_45_ENABLED
 KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP, 45, "GOMP_4.5");
 KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP_ULL, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_POST, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_WAIT, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_POST, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, 45,
+                   "GOMP_4.5");
 #endif
 
 #endif // KMP_USE_VERSION_SYMBOLS
diff --git a/runtime/src/kmp_io.cpp b/runtime/src/kmp_io.cpp
index 91ebefe..4f58ea0 100644
--- a/runtime/src/kmp_io.cpp
+++ b/runtime/src/kmp_io.cpp
@@ -152,9 +152,7 @@
 
   if (__kmp_debug_buf && __kmp_debug_buffer != NULL) {
 
-    int dc = (__kmp_debug_buf_atomic ? KMP_TEST_THEN_INC32(&__kmp_debug_count)
-                                     : __kmp_debug_count++) %
-             __kmp_debug_buf_lines;
+    int dc = __kmp_debug_count++ % __kmp_debug_buf_lines;
     char *db = &__kmp_debug_buffer[dc * __kmp_debug_buf_chars];
     int chars = 0;
 
diff --git a/runtime/src/kmp_lock.cpp b/runtime/src/kmp_lock.cpp
index fb9edb1..16834c6 100644
--- a/runtime/src/kmp_lock.cpp
+++ b/runtime/src/kmp_lock.cpp
@@ -19,6 +19,8 @@
 #include "kmp_io.h"
 #include "kmp_itt.h"
 #include "kmp_lock.h"
+#include "kmp_wait_release.h"
+#include "kmp_wrapper_getpid.h"
 
 #include "tsan_annotations.h"
 
@@ -69,7 +71,7 @@
 // entire 8 bytes were allocated for nested locks on all 64-bit platforms.
 
 static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
-  return KMP_LOCK_STRIP(TCR_4(lck->lk.poll)) - 1;
+  return KMP_LOCK_STRIP(KMP_ATOMIC_LD_RLX(&lck->lk.poll)) - 1;
 }
 
 static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
@@ -81,15 +83,17 @@
   KMP_MB();
 
 #ifdef USE_LOCK_PROFILE
-  kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
+  kmp_uint32 curr = KMP_LOCK_STRIP(lck->lk.poll);
   if ((curr != 0) && (curr != gtid + 1))
     __kmp_printf("LOCK CONTENTION: %p\n", lck);
 /* else __kmp_printf( "." );*/
 #endif /* USE_LOCK_PROFILE */
 
-  if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
-      KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
-                                  KMP_LOCK_BUSY(gtid + 1, tas))) {
+  kmp_int32 tas_free = KMP_LOCK_FREE(tas);
+  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
+
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free &&
+      __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
     KMP_FSYNC_ACQUIRED(lck);
     return KMP_LOCK_ACQUIRED_FIRST;
   }
@@ -104,10 +108,8 @@
   }
 
   kmp_backoff_t backoff = __kmp_spin_backoff_params;
-  while ((lck->lk.poll != KMP_LOCK_FREE(tas)) ||
-         (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
-                                       KMP_LOCK_BUSY(gtid + 1, tas)))) {
-
+  while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
+         !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
     __kmp_spin_backoff(&backoff);
     if (TCR_4(__kmp_nth) >
         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
@@ -140,9 +142,10 @@
 }
 
 int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
-  if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
-      KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
-                                  KMP_LOCK_BUSY(gtid + 1, tas))) {
+  kmp_int32 tas_free = KMP_LOCK_FREE(tas);
+  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free &&
+      __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
     KMP_FSYNC_ACQUIRED(lck);
     return TRUE;
   }
@@ -164,7 +167,7 @@
 
   KMP_FSYNC_RELEASING(lck);
   ANNOTATE_TAS_RELEASED(lck);
-  KMP_ST_REL32(&(lck->lk.poll), KMP_LOCK_FREE(tas));
+  KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
   KMP_MB(); /* Flush all pending memory write invalidates.  */
 
   KMP_YIELD(TCR_4(__kmp_nth) >
@@ -191,11 +194,7 @@
 }
 
 void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
-  TCW_4(lck->lk.poll, KMP_LOCK_FREE(tas));
-}
-
-static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
-  __kmp_init_tas_lock(lck);
+  lck->lk.poll = KMP_LOCK_FREE(tas);
 }
 
 void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; }
@@ -294,10 +293,6 @@
   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
 }
 
-static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
-  __kmp_init_nested_tas_lock(lck);
-}
-
 void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
   __kmp_destroy_tas_lock(lck);
   lck->lk.depth_locked = 0;
@@ -507,10 +502,6 @@
   TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
 }
 
-static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
-  __kmp_init_futex_lock(lck);
-}
-
 void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }
 
 static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
@@ -607,10 +598,6 @@
   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
 }
 
-static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
-  __kmp_init_nested_futex_lock(lck);
-}
-
 void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
   __kmp_destroy_futex_lock(lck);
   lck->lk.depth_locked = 0;
@@ -797,10 +784,6 @@
                              std::memory_order_release);
 }
 
-static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
-  __kmp_init_ticket_lock(lck);
-}
-
 void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
   std::atomic_store_explicit(&lck->lk.initialized, false,
                              std::memory_order_release);
@@ -952,10 +935,6 @@
   // >= 0 for nestable locks, -1 for simple locks
 }
 
-static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
-  __kmp_init_nested_ticket_lock(lck);
-}
-
 void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
   __kmp_destroy_ticket_lock(lck);
   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
@@ -984,12 +963,6 @@
 
 // access functions to fields which don't exist for all lock kinds.
 
-static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
-  return std::atomic_load_explicit(&lck->lk.initialized,
-                                   std::memory_order_relaxed) &&
-         (lck->lk.self == lck);
-}
-
 static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
   return lck->lk.location;
 }
@@ -1574,10 +1547,6 @@
   KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
 }
 
-static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
-  __kmp_init_queuing_lock(lck);
-}
-
 void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
   lck->lk.initialized = NULL;
   lck->lk.location = NULL;
@@ -1703,11 +1672,6 @@
   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
 }
 
-static void
-__kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
-  __kmp_init_nested_queuing_lock(lck);
-}
-
 void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
   __kmp_destroy_queuing_lock(lck);
   lck->lk.depth_locked = 0;
@@ -1730,10 +1694,6 @@
 
 // access functions to fields which don't exist for all lock kinds.
 
-static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
-  return lck == lck->lk.initialized;
-}
-
 static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
   return lck->lk.location;
 }
@@ -1868,13 +1828,15 @@
 static kmp_adaptive_lock_info_t liveLocks;
 
 // A lock so we can safely update the list of locks.
-static kmp_bootstrap_lock_t chain_lock;
+static kmp_bootstrap_lock_t chain_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(chain_lock);
 
 // Initialize the list of stats.
 void __kmp_init_speculative_stats() {
   kmp_adaptive_lock_info_t *lck = &liveLocks;
 
-  memset((void *)&(lck->stats), 0, sizeof(lck->stats));
+  memset(CCAST(kmp_adaptive_lock_statistics_t *, &(lck->stats)), 0,
+         sizeof(lck->stats));
   lck->stats.next = lck;
   lck->stats.prev = lck;
 
@@ -1912,7 +1874,8 @@
 }
 
 static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
-  memset((void *)&lck->stats, 0, sizeof(lck->stats));
+  memset(CCAST(kmp_adaptive_lock_statistics_t *, &lck->stats), 0,
+         sizeof(lck->stats));
   __kmp_remember_lock(lck);
 }
 
@@ -1929,8 +1892,6 @@
 }
 
 static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
-  kmp_adaptive_lock_statistics_t *t = &destroyedStats;
-
   __kmp_acquire_bootstrap_lock(&chain_lock);
 
   __kmp_add_stats(&destroyedStats, lck);
@@ -1958,11 +1919,6 @@
 }
 
 void __kmp_print_speculative_stats() {
-  if (__kmp_user_lock_kind != lk_adaptive)
-    return;
-
-  FILE *statsFile = __kmp_open_stats_file();
-
   kmp_adaptive_lock_statistics_t total = destroyedStats;
   kmp_adaptive_lock_info_t *lck;
 
@@ -1975,6 +1931,10 @@
   kmp_uint32 totalSpeculations = t->successfulSpeculations +
                                  t->hardFailedSpeculations +
                                  t->softFailedSpeculations;
+  if (totalSections <= 0)
+    return;
+
+  FILE *statsFile = __kmp_open_stats_file();
 
   fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
   fprintf(statsFile, " Lock parameters: \n"
@@ -2249,10 +2209,6 @@
   KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
 }
 
-static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
-  __kmp_init_adaptive_lock(lck);
-}
-
 static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
 #if KMP_DEBUG_ADAPTIVE_LOCKS
   __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
@@ -2279,7 +2235,7 @@
 /* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
 
 static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
-  return TCR_4(lck->lk.owner_id) - 1;
+  return lck->lk.owner_id - 1;
 }
 
 static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
@@ -2288,13 +2244,12 @@
 
 __forceinline static int
 __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
-  kmp_uint64 ticket =
-      KMP_TEST_THEN_INC64(RCAST(volatile kmp_int64 *, &lck->lk.next_ticket));
-  kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
-  volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
+  kmp_uint64 ticket = KMP_ATOMIC_INC(&lck->lk.next_ticket);
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
 
 #ifdef USE_LOCK_PROFILE
-  if (TCR_8(polls[ticket & mask].poll) != ticket)
+  if (polls[ticket & mask] != ticket)
     __kmp_printf("LOCK CONTENTION: %p\n", lck);
 /* else __kmp_printf( "." );*/
 #endif /* USE_LOCK_PROFILE */
@@ -2311,7 +2266,7 @@
 
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
-  while (TCR_8(polls[ticket & mask].poll) < ticket) { // volatile load
+  while (polls[ticket & mask] < ticket) { // atomic load
     // If we are oversubscribed,
     // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
     // CPU Pause is in the macros for yield.
@@ -2327,8 +2282,8 @@
     // If another thread picks reconfigures the polling area and updates their
     // values, and we get the new value of mask and the old polls pointer, we
     // could access memory beyond the end of the old polling area.
-    mask = TCR_8(lck->lk.mask); // volatile load
-    polls = lck->lk.polls; // volatile load
+    mask = lck->lk.mask; // atomic load
+    polls = lck->lk.polls; // atomic load
   }
 
   // Critical section starts here
@@ -2343,7 +2298,7 @@
   // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
   // ticket.
   if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
-    __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
+    __kmp_free(lck->lk.old_polls);
     lck->lk.old_polls = NULL;
     lck->lk.cleanup_ticket = 0;
   }
@@ -2353,7 +2308,7 @@
   // previous reconfiguration, let a later thread reconfigure it.
   if (lck->lk.old_polls == NULL) {
     bool reconfigure = false;
-    volatile struct kmp_base_drdpa_lock::kmp_lock_poll *old_polls = polls;
+    std::atomic<kmp_uint64> *old_polls = polls;
     kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
 
     if (TCR_4(__kmp_nth) >
@@ -2365,9 +2320,9 @@
         num_polls = TCR_4(lck->lk.num_polls);
         mask = 0;
         num_polls = 1;
-        polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
-            __kmp_allocate(num_polls * sizeof(*polls));
-        polls[0].poll = ticket;
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
+        polls[0] = ticket;
       }
     } else {
       // We are in under/fully subscribed mode.  Check the number of
@@ -2386,11 +2341,11 @@
         // of the old polling area to the new area.  __kmp_allocate()
         // zeroes the memory it allocates, and most of the old area is
         // just zero padding, so we only copy the release counters.
-        polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
-            __kmp_allocate(num_polls * sizeof(*polls));
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
         kmp_uint32 i;
         for (i = 0; i < old_num_polls; i++) {
-          polls[i].poll = old_polls[i].poll;
+          polls[i].store(old_polls[i]);
         }
       }
     }
@@ -2409,13 +2364,13 @@
                       "lock %p to %d polls\n",
                       ticket, lck, num_polls));
 
-      lck->lk.old_polls = old_polls; // non-volatile store
-      lck->lk.polls = polls; // volatile store
+      lck->lk.old_polls = old_polls;
+      lck->lk.polls = polls; // atomic store
 
       KMP_MB();
 
-      lck->lk.num_polls = num_polls; // non-volatile store
-      lck->lk.mask = mask; // volatile store
+      lck->lk.num_polls = num_polls;
+      lck->lk.mask = mask; // atomic store
 
       KMP_MB();
 
@@ -2423,7 +2378,7 @@
       // to main memory can we update the cleanup ticket field.
       //
       // volatile load / non-volatile store
-      lck->lk.cleanup_ticket = TCR_8(lck->lk.next_ticket);
+      lck->lk.cleanup_ticket = lck->lk.next_ticket;
     }
   }
   return KMP_LOCK_ACQUIRED_FIRST;
@@ -2457,13 +2412,13 @@
 int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
   // First get a ticket, then read the polls pointer and the mask.
   // The polls pointer must be read before the mask!!! (See above)
-  kmp_uint64 ticket = TCR_8(lck->lk.next_ticket); // volatile load
-  volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
-  kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
-  if (TCR_8(polls[ticket & mask].poll) == ticket) {
+  kmp_uint64 ticket = lck->lk.next_ticket; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  if (polls[ticket & mask] == ticket) {
     kmp_uint64 next_ticket = ticket + 1;
-    if (KMP_COMPARE_AND_STORE_ACQ64(&lck->lk.next_ticket, ticket,
-                                    next_ticket)) {
+    if (__kmp_atomic_compare_store_acq(&lck->lk.next_ticket, ticket,
+                                       next_ticket)) {
       KMP_FSYNC_ACQUIRED(lck);
       KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
                       ticket, lck));
@@ -2502,14 +2457,14 @@
 int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
   // Read the ticket value from the lock data struct, then the polls pointer and
   // the mask.  The polls pointer must be read before the mask!!! (See above)
-  kmp_uint64 ticket = lck->lk.now_serving + 1; // non-volatile load
-  volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
-  kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
+  kmp_uint64 ticket = lck->lk.now_serving + 1; // non-atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls; // atomic load
+  kmp_uint64 mask = lck->lk.mask; // atomic load
   KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
                   ticket - 1, lck));
   KMP_FSYNC_RELEASING(lck);
   ANNOTATE_DRDPA_RELEASED(lck);
-  KMP_ST_REL64(&(polls[ticket & mask].poll), ticket); // volatile store
+  polls[ticket & mask] = ticket; // atomic store
   return KMP_LOCK_RELEASED;
 }
 
@@ -2538,9 +2493,8 @@
   lck->lk.location = NULL;
   lck->lk.mask = 0;
   lck->lk.num_polls = 1;
-  lck->lk.polls =
-      (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)__kmp_allocate(
-          lck->lk.num_polls * sizeof(*(lck->lk.polls)));
+  lck->lk.polls = (std::atomic<kmp_uint64> *)__kmp_allocate(
+      lck->lk.num_polls * sizeof(*(lck->lk.polls)));
   lck->lk.cleanup_ticket = 0;
   lck->lk.old_polls = NULL;
   lck->lk.next_ticket = 0;
@@ -2552,19 +2506,15 @@
   KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
 }
 
-static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
-  __kmp_init_drdpa_lock(lck);
-}
-
 void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
   lck->lk.initialized = NULL;
   lck->lk.location = NULL;
-  if (lck->lk.polls != NULL) {
-    __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.polls));
+  if (lck->lk.polls.load() != NULL) {
+    __kmp_free(lck->lk.polls.load());
     lck->lk.polls = NULL;
   }
   if (lck->lk.old_polls != NULL) {
-    __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
+    __kmp_free(lck->lk.old_polls);
     lck->lk.old_polls = NULL;
   }
   lck->lk.mask = 0;
@@ -2688,10 +2638,6 @@
   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
 }
 
-static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
-  __kmp_init_nested_drdpa_lock(lck);
-}
-
 void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
   __kmp_destroy_drdpa_lock(lck);
   lck->lk.depth_locked = 0;
@@ -2713,10 +2659,6 @@
 
 // access functions to fields which don't exist for all lock kinds.
 
-static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
-  return lck == lck->lk.initialized;
-}
-
 static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
   return lck->lk.location;
 }
@@ -2796,6 +2738,10 @@
 
 static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
 
+static void __kmp_destroy_hle_lock_with_checks(kmp_dyna_lock_t *lck) {
+  TCW_4(*lck, 0);
+}
+
 static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
   // Use gtid for KMP_LOCK_BUSY if necessary
   if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
@@ -2845,6 +2791,10 @@
   __kmp_destroy_queuing_lock(lck);
 }
 
+static void __kmp_destroy_rtm_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_destroy_queuing_lock_with_checks(lck);
+}
+
 static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
   unsigned retries = 3, status;
   do {
@@ -2933,7 +2883,12 @@
 
 // destroy functions
 #define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
-void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *) = {
+static void (*direct_destroy[])(kmp_dyna_lock_t *) = {
+    __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
+#undef expand
+#define expand(l, op)                                                          \
+  0, (void (*)(kmp_dyna_lock_t *))__kmp_destroy_##l##_lock_with_checks,
+static void (*direct_destroy_check[])(kmp_dyna_lock_t *) = {
     __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
 #undef expand
 
@@ -2968,6 +2923,7 @@
 #undef expand
 
 // Exposes only one set of jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *) = 0;
 int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
 int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
 int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;
@@ -2976,7 +2932,15 @@
 #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
 void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
     KMP_FOREACH_I_LOCK(expand, init)};
-void (*__kmp_indirect_destroy[])(kmp_user_lock_p) = {
+#undef expand
+
+#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
+static void (*indirect_destroy[])(kmp_user_lock_p) = {
+    KMP_FOREACH_I_LOCK(expand, destroy)};
+#undef expand
+#define expand(l, op)                                                          \
+  (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock_with_checks,
+static void (*indirect_destroy_check[])(kmp_user_lock_p) = {
     KMP_FOREACH_I_LOCK(expand, destroy)};
 #undef expand
 
@@ -3009,6 +2973,7 @@
 #undef expand
 
 // Exposes only one jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_indirect_destroy))(kmp_user_lock_p) = 0;
 int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
 int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
 int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;
@@ -3238,16 +3203,20 @@
     __kmp_direct_set = direct_set_check;
     __kmp_direct_unset = direct_unset_check;
     __kmp_direct_test = direct_test_check;
+    __kmp_direct_destroy = direct_destroy_check;
     __kmp_indirect_set = indirect_set_check;
     __kmp_indirect_unset = indirect_unset_check;
     __kmp_indirect_test = indirect_test_check;
+    __kmp_indirect_destroy = indirect_destroy_check;
   } else {
     __kmp_direct_set = direct_set;
     __kmp_direct_unset = direct_unset;
     __kmp_direct_test = direct_test;
+    __kmp_direct_destroy = direct_destroy;
     __kmp_indirect_set = indirect_set;
     __kmp_indirect_unset = indirect_unset;
     __kmp_indirect_test = indirect_test;
+    __kmp_indirect_destroy = indirect_destroy;
   }
   // If the user locks have already been initialized, then return. Allow the
   // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
@@ -3369,6 +3338,67 @@
 
 #else // KMP_USE_DYNAMIC_LOCK
 
+static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  __kmp_init_tas_lock(lck);
+}
+
+static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  __kmp_init_nested_tas_lock(lck);
+}
+
+#if KMP_USE_FUTEX
+static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  __kmp_init_futex_lock(lck);
+}
+
+static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  __kmp_init_nested_futex_lock(lck);
+}
+#endif
+
+static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
+  return lck == lck->lk.initialized;
+}
+
+static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  __kmp_init_ticket_lock(lck);
+}
+
+static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  __kmp_init_nested_ticket_lock(lck);
+}
+
+static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
+  return lck == lck->lk.initialized;
+}
+
+static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_init_queuing_lock(lck);
+}
+
+static void
+__kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_init_nested_queuing_lock(lck);
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
+  __kmp_init_adaptive_lock(lck);
+}
+#endif
+
+static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
+  return lck == lck->lk.initialized;
+}
+
+static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  __kmp_init_drdpa_lock(lck);
+}
+
+static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  __kmp_init_nested_drdpa_lock(lck);
+}
+
 /* user locks
  * They are implemented as a table of function pointers which are set to the
  * lock functions of the appropriate kind, once that has been determined. */
diff --git a/runtime/src/kmp_lock.h b/runtime/src/kmp_lock.h
index 78cf0cc..220236d 100644
--- a/runtime/src/kmp_lock.h
+++ b/runtime/src/kmp_lock.h
@@ -121,7 +121,7 @@
 
 struct kmp_base_tas_lock {
   // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
-  volatile kmp_int32 poll;
+  std::atomic<kmp_int32> poll;
   kmp_int32 depth_locked; // depth locked, for nested locks only
 };
 
@@ -139,7 +139,7 @@
 //    kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock );
 #define KMP_TAS_LOCK_INITIALIZER(lock)                                         \
   {                                                                            \
-    { KMP_LOCK_FREE(tas), 0 }                                                  \
+    { ATOMIC_VAR_INIT(KMP_LOCK_FREE(tas)), 0 }                                 \
   }
 
 extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
@@ -446,17 +446,17 @@
   volatile union kmp_drdpa_lock
       *initialized; // points to the lock union if in initialized state
   ident_t const *location; // Source code location of omp_init_lock().
-  volatile struct kmp_lock_poll { kmp_uint64 poll; } * volatile polls;
-  volatile kmp_uint64 mask; // is 2**num_polls-1 for mod op
+  std::atomic<std::atomic<kmp_uint64> *> polls;
+  std::atomic<kmp_uint64> mask; // is 2**num_polls-1 for mod op
   kmp_uint64 cleanup_ticket; // thread with cleanup ticket
-  volatile struct kmp_lock_poll *old_polls; // will deallocate old_polls
+  std::atomic<kmp_uint64> *old_polls; // will deallocate old_polls
   kmp_uint32 num_polls; // must be power of 2
 
   // next_ticket it needs to exist in a separate cache line, as it is
   // invalidated every time a thread takes a new ticket.
   KMP_ALIGN_CACHE
 
-  volatile kmp_uint64 next_ticket;
+  std::atomic<kmp_uint64> next_ticket;
 
   // now_serving is used to store our ticket value while we hold the lock. It
   // has a slightly different meaning in the DRDPA ticket locks (where it is
@@ -518,6 +518,8 @@
 typedef kmp_ticket_lock_t kmp_bootstrap_lock_t;
 
 #define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) KMP_TICKET_LOCK_INITIALIZER((lock))
+#define KMP_BOOTSTRAP_LOCK_INIT(lock)                                          \
+  kmp_bootstrap_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)
 
 static inline int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
   return __kmp_acquire_ticket_lock(lck, KMP_GTID_DNE);
@@ -549,6 +551,8 @@
 
 typedef kmp_ticket_lock_t kmp_lock_t;
 
+#define KMP_LOCK_INIT(lock) kmp_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)
+
 static inline int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid) {
   return __kmp_acquire_ticket_lock(lck, gtid);
 }
@@ -644,8 +648,8 @@
         KMP_FATAL(LockIsAlreadyOwned, func);                                   \
       }                                                                        \
     }                                                                          \
-    if ((lck->tas.lk.poll != 0) ||                                             \
-        (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0, gtid + 1))) {    \
+    if (lck->tas.lk.poll != 0 ||                                               \
+        !__kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {        \
       kmp_uint32 spins;                                                        \
       KMP_FSYNC_PREPARE(lck);                                                  \
       KMP_INIT_YIELD(spins);                                                   \
@@ -655,9 +659,8 @@
       } else {                                                                 \
         KMP_YIELD_SPIN(spins);                                                 \
       }                                                                        \
-      while (                                                                  \
-          (lck->tas.lk.poll != 0) ||                                           \
-          (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0, gtid + 1))) {  \
+      while (lck->tas.lk.poll != 0 ||                                          \
+             !__kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {   \
         if (TCR_4(__kmp_nth) >                                                 \
             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
           KMP_YIELD(TRUE);                                                     \
@@ -699,7 +702,7 @@
       }
     }
     return ((lck->tas.lk.poll == 0) &&
-            KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0, gtid + 1));
+            __kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1));
   } else {
     KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL);
     return (*__kmp_test_user_lock_with_checks_)(lck, gtid);
@@ -764,7 +767,7 @@
       *depth = KMP_LOCK_ACQUIRED_NEXT;                                         \
     } else {                                                                   \
       if ((lck->tas.lk.poll != 0) ||                                           \
-          (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0, gtid + 1))) {  \
+          !__kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {      \
         kmp_uint32 spins;                                                      \
         KMP_FSYNC_PREPARE(lck);                                                \
         KMP_INIT_YIELD(spins);                                                 \
@@ -775,8 +778,7 @@
           KMP_YIELD_SPIN(spins);                                               \
         }                                                                      \
         while ((lck->tas.lk.poll != 0) ||                                      \
-               (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0,           \
-                                             gtid + 1))) {                     \
+               !__kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
           if (TCR_4(__kmp_nth) >                                               \
               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {           \
             KMP_YIELD(TRUE);                                                   \
@@ -824,7 +826,7 @@
       return ++lck->tas.lk.depth_locked; /* same owner, depth increased */
     }
     retval = ((lck->tas.lk.poll == 0) &&
-              KMP_COMPARE_AND_STORE_ACQ32(&(lck->tas.lk.poll), 0, gtid + 1));
+              __kmp_compare_and_store_acq(&lck->tas.lk.poll, 0, gtid + 1));
     if (retval) {
       KMP_MB();
       lck->tas.lk.depth_locked = 1;
@@ -1141,7 +1143,7 @@
 // Function tables for direct locks. Set/unset/test differentiate functions
 // with/without consistency checking.
 extern void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t);
-extern void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *);
+extern void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *);
 extern int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32);
 extern int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32);
 extern int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32);
@@ -1149,7 +1151,7 @@
 // Function tables for indirect locks. Set/unset/test differentiate functions
 // with/withuot consistency checking.
 extern void (*__kmp_indirect_init[])(kmp_user_lock_p);
-extern void (*__kmp_indirect_destroy[])(kmp_user_lock_p);
+extern void (*(*__kmp_indirect_destroy))(kmp_user_lock_p);
 extern int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32);
 extern int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32);
 extern int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32);
diff --git a/runtime/src/kmp_omp.h b/runtime/src/kmp_omp.h
index 2d6fd19..8dcb7d2 100644
--- a/runtime/src/kmp_omp.h
+++ b/runtime/src/kmp_omp.h
@@ -74,7 +74,9 @@
   addr_and_size_t threads; // Pointer to __kmp_threads.
   addr_and_size_t roots; // Pointer to __kmp_root.
   addr_and_size_t capacity; // Pointer to __kmp_threads_capacity.
+#if KMP_USE_MONITOR
   addr_and_size_t monitor; // Pointer to __kmp_monitor.
+#endif
 #if !KMP_USE_DYNAMIC_LOCK
   addr_and_size_t lock_table; // Pointer to __kmp_lock_table.
 #endif
diff --git a/runtime/src/kmp_os.h b/runtime/src/kmp_os.h
index 9ad023c..93743ad 100644
--- a/runtime/src/kmp_os.h
+++ b/runtime/src/kmp_os.h
@@ -16,6 +16,7 @@
 
 #include "kmp_config.h"
 #include <stdlib.h>
+#include <atomic>
 
 #define KMP_FTN_PLAIN 1
 #define KMP_FTN_APPEND 2
@@ -312,9 +313,12 @@
 #endif
 
 // Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME
-#ifdef KMP_USE_VERSION_SYMBOLS
+#ifndef KMP_STR
 #define KMP_STR(x) _KMP_STR(x)
 #define _KMP_STR(x) #x
+#endif
+
+#ifdef KMP_USE_VERSION_SYMBOLS
 // If using versioned symbols, KMP_EXPAND_NAME prepends
 // __kmp_api_ to the real API name
 #define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name)
@@ -905,6 +909,45 @@
 } // extern "C"
 #endif // __cplusplus
 
+// Macros for C++11 atomic functions
+#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order)
+#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order)
+
+// For non-default load/store
+#define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire)
+#define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed)
+#define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release)
+#define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed)
+
+// For non-default fetch_<op>
+#define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel)
+#define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel)
+#define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel)
+#define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel)
+#define KMP_ATOMIC_INC(p) KMP_ATOMIC_OP(fetch_add, p, 1, acq_rel)
+#define KMP_ATOMIC_DEC(p) KMP_ATOMIC_OP(fetch_sub, p, 1, acq_rel)
+#define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed)
+#define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_OP(fetch_add, p, 1, relaxed)
+
+// Callers of the following functions cannot see the side effect on "expected".
+template <typename T>
+bool __kmp_atomic_compare_store(std::atomic<T> *p, T expected, T desired) {
+  return p->compare_exchange_strong(
+      expected, desired, std::memory_order_acq_rel, std::memory_order_relaxed);
+}
+
+template <typename T>
+bool __kmp_atomic_compare_store_acq(std::atomic<T> *p, T expected, T desired) {
+  return p->compare_exchange_strong(
+      expected, desired, std::memory_order_acquire, std::memory_order_relaxed);
+}
+
+template <typename T>
+bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
+  return p->compare_exchange_strong(
+      expected, desired, std::memory_order_release, std::memory_order_relaxed);
+}
+
 #endif /* KMP_OS_H */
 // Safe C API
 #include "kmp_safe_c_api.h"
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index da5e5bc..0ccf6a5 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -24,6 +24,10 @@
 #include "kmp_str.h"
 #include "kmp_wait_release.h"
 #include "kmp_wrapper_getpid.h"
+#include "kmp_dispatch.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
 
 #if OMPT_SUPPORT
 #include "ompt-specific.h"
@@ -63,7 +67,9 @@
 
 /* ------------------------------------------------------------------------ */
 
+#if KMP_USE_MONITOR
 kmp_info_t __kmp_monitor;
+#endif
 
 /* Forward declarations */
 
@@ -754,8 +760,8 @@
        single block */
     /* TODO: Should this be acquire or release? */
     if (team->t.t_construct == old_this) {
-      status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
-                                           th->th.th_local.this_construct);
+      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
+                                              th->th.th_local.this_construct);
     }
 #if USE_ITT_BUILD
     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
@@ -1599,7 +1605,7 @@
 
       parent_team->t.t_pkfn = microtask;
       parent_team->t.t_invoke = invoker;
-      KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
+      KMP_ATOMIC_INC(&root->r.r_in_parallel);
       parent_team->t.t_active_level++;
       parent_team->t.t_level++;
 
@@ -1638,12 +1644,8 @@
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
 
-      {
-        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-        if (!parent_team->t.t_invoke(gtid)) {
-          KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
-        }
+      if (!parent_team->t.t_invoke(gtid)) {
+        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
       }
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
@@ -1833,11 +1835,7 @@
           //     because initial code in teams should have level=0
           team->t.t_level--;
           // AC: call special invoker for outer "parallel" of teams construct
-          {
-            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-            invoker(gtid);
-          }
+          invoker(gtid);
         } else {
 #endif /* OMP_40_ENABLED */
           argv = args;
@@ -1956,7 +1954,7 @@
 #endif /* OMP_40_ENABLED */
     {
       /* Increment our nested depth level */
-      KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
+      KMP_ATOMIC_INC(&root->r.r_in_parallel);
     }
 
     // See if we need to make a copy of the ICVs.
@@ -2250,12 +2248,8 @@
                   team->t.t_id, team->t.t_pkfn));
   } // END of timer KMP_fork_call block
 
-  {
-    KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-    KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-    if (!team->t.t_invoke(gtid)) {
-      KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
-    }
+  if (!team->t.t_invoke(gtid)) {
+    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
   }
   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                 team->t.t_id, team->t.t_pkfn));
@@ -2433,7 +2427,7 @@
     /* Decrement our nested depth level */
     team->t.t_level--;
     team->t.t_active_level--;
-    KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
+    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
 
     /* Restore number of threads in the team if needed */
     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
@@ -2491,7 +2485,7 @@
 #endif /* OMP_40_ENABLED */
   {
     /* Decrement our nested depth level */
-    KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
+    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
   }
   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
 
@@ -3070,6 +3064,9 @@
       team->t.t_dispatch[i].th_disp_buffer = NULL;
     }
   }
+#if KMP_USE_HIER_SCHED
+  __kmp_dispatch_free_hierarchies(team);
+#endif
   __kmp_free(team->t.t_threads);
   __kmp_free(team->t.t_disp_buffer);
   __kmp_free(team->t.t_dispatch);
@@ -3388,7 +3385,8 @@
                                      root->r.r_uber_thread);
         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
-        __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
+        __kmp_printf("    In Parallel:  %2d\n",
+                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
         __kmp_printf("\n");
         __kmp_print_structure_team_accum(list, root->r.r_root_team);
         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
@@ -3690,7 +3688,7 @@
 #if KMP_STATS_ENABLED
   // Initialize stats as soon as possible (right after gtid assignment).
   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
-  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+  __kmp_stats_thread_ptr->startLife();
   KMP_SET_THREAD_STATE(SERIAL_REGION);
   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
 #endif
@@ -4009,10 +4007,11 @@
   kmp_team_t *steam = this_thr->th.th_serial_team;
   kmp_team_t *team = this_thr->th.th_team;
 
-  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
-               "ptask=%p\n",
-               gtid, tid, this_thr, team, this_thr->th.th_current_task,
-               team->t.t_implicit_task_taskdata[tid].td_parent);
+  __kmp_printf(
+      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
+      "ptask=%p\n",
+      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
+      team->t.t_implicit_task_taskdata[tid].td_parent);
 }
 #endif // KMP_DEBUG
 
@@ -4324,6 +4323,9 @@
 
   new_thr->th.th_spin_here = FALSE;
   new_thr->th.th_next_waiting = 0;
+#if KMP_OS_UNIX
+  new_thr->th.th_blocking = false;
+#endif
 
 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
@@ -4448,7 +4450,9 @@
 #ifdef KMP_DEBUG
   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
 #endif
+#if KMP_OS_WINDOWS
   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
+#endif
 
   team->t.t_control_stack_top = NULL;
 
@@ -5346,7 +5350,9 @@
   /* team is done working */
   TCW_SYNC_PTR(team->t.t_pkfn,
                NULL); // Important for Debugging Support Library.
+#if KMP_OS_WINDOWS
   team->t.t_copyin_counter = 0; // init counter for possible reuse
+#endif
   // Do not reset pointer to parent team to NULL for hot teams.
 
   /* if we are non-hot team, release our threads */
@@ -5622,11 +5628,7 @@
         }
 #endif
 
-        {
-          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-          rc = (*pteam)->t.t_invoke(gtid);
-        }
+        rc = (*pteam)->t.t_invoke(gtid);
         KMP_ASSERT(rc);
 
         KMP_MB();
@@ -5783,8 +5785,8 @@
     // so there are no harmful side effects.
     if (thread->th.th_active_in_pool) {
       thread->th.th_active_in_pool = FALSE;
-      KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
-      KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
+      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
     }
 
     // Decrement # of [worker] threads in the pool.
@@ -5849,6 +5851,13 @@
   }
 #endif /* KMP_AFFINITY_SUPPORTED */
 
+#if KMP_USE_HIER_SCHED
+  if (thread->th.th_hier_bar_data != NULL) {
+    __kmp_free(thread->th.th_hier_bar_data);
+    thread->th.th_hier_bar_data = NULL;
+  }
+#endif
+
   __kmp_reap_team(thread->th.th_serial_team);
   thread->th.th_serial_team = NULL;
   __kmp_free(thread);
@@ -5940,6 +5949,18 @@
 
     __kmp_reap_task_teams();
 
+#if KMP_OS_UNIX
+    // Threads that are not reaped should not access any resources since they
+    // are going to be deallocated soon, so the shutdown sequence should wait
+    // until all threads either exit the final spin-waiting loop or begin
+    // sleeping after the given blocktime.
+    for (i = 0; i < __kmp_threads_capacity; i++) {
+      kmp_info_t *thr = __kmp_threads[i];
+      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
+        KMP_CPU_PAUSE();
+    }
+#endif
+
     for (i = 0; i < __kmp_threads_capacity; ++i) {
       // TBD: Add some checking...
       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
@@ -6273,7 +6294,7 @@
       if (tail != NULL) {
         long *flag_addr = 0;
         long flag_val = 0;
-        KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
+        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
           // First, check whether environment-encoded address is mapped into
@@ -7240,7 +7261,7 @@
   // executing thread (to become the master) are available to add to the new
   // team, but are currently contributing to the system load, and must be
   // accounted for.
-  pool_active = TCR_4(__kmp_thread_pool_active_nth);
+  pool_active = __kmp_thread_pool_active_nth;
   hot_team_active = __kmp_active_hot_team_nproc(root);
   team_curr_active = pool_active + hot_team_active + 1;
 
@@ -7364,6 +7385,10 @@
 
   __kmp_i18n_catclose();
 
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.deallocate();
+#endif
+
 #if KMP_STATS_ENABLED
   __kmp_stats_fini();
 #endif
@@ -7614,7 +7639,6 @@
   } else {
 
     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
-    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
 
 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
 
@@ -7628,6 +7652,7 @@
       teamsize_cutoff = 8;
     }
 #endif
+    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
     if (tree_available) {
       if (team_size <= teamsize_cutoff) {
         if (atomic_available) {
@@ -7658,6 +7683,7 @@
 
 #elif KMP_OS_DARWIN
 
+    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
     if (atomic_available && (num_vars <= 3)) {
       retval = atomic_reduce_block;
     } else if (tree_available) {
diff --git a/runtime/src/kmp_sched.cpp b/runtime/src/kmp_sched.cpp
index 3700ee0..67fe02f 100644
--- a/runtime/src/kmp_sched.cpp
+++ b/runtime/src/kmp_sched.cpp
@@ -51,8 +51,9 @@
                                   void *codeptr
 #endif
                                   ) {
-  KMP_COUNT_BLOCK(OMP_FOR_static);
-  KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
+  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
 
   typedef typename traits_t<T>::unsigned_t UT;
   typedef typename traits_t<T>::signed_t ST;
@@ -151,7 +152,6 @@
           &(task_info->task_data), 0, codeptr);
     }
 #endif
-    KMP_COUNT_VALUE(FOR_static_iterations, 0);
     return;
   }
 
@@ -254,7 +254,6 @@
                             loc);
     }
   }
-  KMP_COUNT_VALUE(FOR_static_iterations, trip_count);
 
   /* compute remaining parameters */
   switch (schedtype) {
@@ -390,6 +389,26 @@
   }
 #endif
 
+#if KMP_STATS_ENABLED
+  {
+    kmp_int64 t;
+    kmp_int64 u = (kmp_int64)(*pupper);
+    kmp_int64 l = (kmp_int64)(*plower);
+    kmp_int64 i = (kmp_int64)incr;
+    /* compute trip count */
+    if (i == 1) {
+      t = u - l + 1;
+    } else if (i == -1) {
+      t = l - u + 1;
+    } else if (i > 0) {
+      t = (u - l) / i + 1;
+    } else {
+      t = (l - u) / (-i) + 1;
+    }
+    KMP_COUNT_VALUE(OMP_loop_static_iterations, t);
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif
   return;
 }
 
@@ -456,7 +475,7 @@
   nteams = th->th.th_teams_size.nteams;
 #endif
   team_id = team->t.t_master_tid;
-  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
 
   // compute global trip count
   if (incr == 1) {
@@ -698,7 +717,7 @@
   nteams = th->th.th_teams_size.nteams;
 #endif
   team_id = team->t.t_master_tid;
-  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
 
   // compute trip count
   if (incr == 1) {
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index d2502d0..72b7794 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -14,6 +14,9 @@
 #include "kmp.h"
 #include "kmp_affinity.h"
 #include "kmp_atomic.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
 #include "kmp_environment.h"
 #include "kmp_i18n.h"
 #include "kmp_io.h"
@@ -31,6 +34,7 @@
 // -----------------------------------------------------------------------------
 // Helper string functions. Subject to move to kmp_str.
 
+#ifdef USE_LOAD_BALANCE
 static double __kmp_convert_to_double(char const *s) {
   double result;
 
@@ -40,6 +44,7 @@
 
   return result;
 }
+#endif
 
 #ifdef KMP_DEBUG
 static unsigned int __kmp_readstr_with_sentinel(char *dest, char const *src,
@@ -129,6 +134,7 @@
   return TRUE;
 }
 
+#if KMP_OS_DARWIN
 static size_t __kmp_round4k(size_t size) {
   size_t _4k = 4 * 1024;
   if (size & (_4k - 1)) {
@@ -139,6 +145,7 @@
   }
   return size;
 } // __kmp_round4k
+#endif
 
 /* Here, multipliers are like __kmp_convert_to_seconds, but floating-point
    values are allowed, and the return value is in milliseconds.  The default
@@ -382,7 +389,7 @@
 
 #if KMP_DEBUG_ADAPTIVE_LOCKS
 static void __kmp_stg_parse_file(char const *name, char const *value,
-                                 char *suffix, char **out) {
+                                 const char *suffix, char **out) {
   char buffer[256];
   char *t;
   int hasSuffix;
@@ -529,6 +536,7 @@
   }
 } // __kmp_stg_print_int
 
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
 static void __kmp_stg_print_uint64(kmp_str_buf_t *buffer, char const *name,
                                    kmp_uint64 value) {
   if (__kmp_env_format) {
@@ -537,6 +545,7 @@
     __kmp_str_buf_print(buffer, "   %s=%" KMP_UINT64_SPEC "\n", name, value);
   }
 } // __kmp_stg_print_uint64
+#endif
 
 static void __kmp_stg_print_str(kmp_str_buf_t *buffer, char const *name,
                                 char const *value) {
@@ -1946,7 +1955,6 @@
   // Guards.
   int type = 0;
   int proclist = 0;
-  int max_proclist = 0;
   int verbose = 0;
   int warnings = 0;
   int respect = 0;
@@ -2636,7 +2644,7 @@
   const char *next = scan;
 
   for (;;) {
-    int start, count, stride;
+    int count, stride;
 
     if (!__kmp_parse_place(var, &scan)) {
       return FALSE;
@@ -3425,72 +3433,152 @@
 // -----------------------------------------------------------------------------
 // OMP_SCHEDULE
 
+static inline void __kmp_omp_schedule_restore() {
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.deallocate();
+#endif
+  __kmp_chunk = 0;
+  __kmp_sched = kmp_sch_default;
+}
+
+static const char *__kmp_parse_single_omp_schedule(const char *name,
+                                                   const char *value,
+                                                   bool parse_hier = false) {
+  /* get the specified scheduling style */
+  const char *ptr = value;
+  const char *comma = strchr(ptr, ',');
+  const char *delim;
+  int chunk = 0;
+  enum sched_type sched = kmp_sch_default;
+  if (*ptr == '\0')
+    return NULL;
+#if KMP_USE_HIER_SCHED
+  kmp_hier_layer_e layer = kmp_hier_layer_e::LAYER_THREAD;
+  if (parse_hier) {
+    if (!__kmp_strcasecmp_with_sentinel("L1", ptr, ',')) {
+      layer = kmp_hier_layer_e::LAYER_L1;
+    } else if (!__kmp_strcasecmp_with_sentinel("L2", ptr, ',')) {
+      layer = kmp_hier_layer_e::LAYER_L2;
+    } else if (!__kmp_strcasecmp_with_sentinel("L3", ptr, ',')) {
+      layer = kmp_hier_layer_e::LAYER_L3;
+    } else if (!__kmp_strcasecmp_with_sentinel("NUMA", ptr, ',')) {
+      layer = kmp_hier_layer_e::LAYER_NUMA;
+    }
+    if (layer != kmp_hier_layer_e::LAYER_THREAD && !comma) {
+      // If there is no comma after the layer, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
+    } else if (layer != kmp_hier_layer_e::LAYER_THREAD) {
+      ptr = ++comma;
+      comma = strchr(ptr, ',');
+    }
+  }
+  delim = ptr;
+  while (*delim != ',' && *delim != ':' && *delim != '\0')
+    delim++;
+#else // KMP_USE_HIER_SCHED
+  delim = ptr;
+  while (*delim != ',' && *delim != '\0')
+    delim++;
+#endif // KMP_USE_HIER_SCHED
+  if (!__kmp_strcasecmp_with_sentinel("dynamic", ptr, *delim)) /* DYNAMIC */
+    sched = kmp_sch_dynamic_chunked;
+  else if (!__kmp_strcasecmp_with_sentinel("guided", ptr, *delim)) /* GUIDED */
+    sched = kmp_sch_guided_chunked;
+  // AC: TODO: add AUTO schedule, and probably remove TRAPEZOIDAL (OMP 3.0 does
+  // not allow it)
+  else if (!__kmp_strcasecmp_with_sentinel("auto", ptr, *delim)) { /* AUTO */
+    sched = kmp_sch_auto;
+    if (comma) {
+      __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, comma),
+                __kmp_msg_null);
+      comma = NULL;
+    }
+  } else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", ptr,
+                                             *delim)) /* TRAPEZOIDAL */
+    sched = kmp_sch_trapezoidal;
+  else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim)) /* STATIC */
+    sched = kmp_sch_static;
+#if KMP_STATIC_STEAL_ENABLED
+  else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim))
+    sched = kmp_sch_static_steal;
+#endif
+  else {
+    KMP_WARNING(StgInvalidValue, name, value);
+    __kmp_omp_schedule_restore();
+    return NULL;
+  }
+  if (ptr && comma && *comma == *delim) {
+    ptr = comma + 1;
+    SKIP_DIGITS(ptr);
+
+    if (sched == kmp_sch_static)
+      sched = kmp_sch_static_chunked;
+    ++comma;
+    chunk = __kmp_str_to_int(comma, *ptr);
+    if (chunk < 1) {
+      chunk = KMP_DEFAULT_CHUNK;
+      __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, comma),
+                __kmp_msg_null);
+      KMP_INFORM(Using_int_Value, name, __kmp_chunk);
+      // AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK
+      // (to improve code coverage :)
+      //     The default chunk size is 1 according to standard, thus making
+      //     KMP_MIN_CHUNK not 1 we would introduce mess:
+      //     wrong chunk becomes 1, but it will be impossible to explicitly set
+      //     1, because it becomes KMP_MIN_CHUNK...
+      //                } else if ( chunk < KMP_MIN_CHUNK ) {
+      //                    chunk = KMP_MIN_CHUNK;
+    } else if (chunk > KMP_MAX_CHUNK) {
+      chunk = KMP_MAX_CHUNK;
+      __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, comma),
+                __kmp_msg_null);
+      KMP_INFORM(Using_int_Value, name, chunk);
+    }
+  } else if (ptr) {
+    SKIP_TOKEN(ptr);
+  }
+#if KMP_USE_HIER_SCHED
+  if (layer != kmp_hier_layer_e::LAYER_THREAD) {
+    __kmp_hier_scheds.append(sched, chunk, layer);
+  } else
+#endif
+  {
+    __kmp_chunk = chunk;
+    __kmp_sched = sched;
+  }
+  return ptr;
+}
+
 static void __kmp_stg_parse_omp_schedule(char const *name, char const *value,
                                          void *data) {
   size_t length;
+  const char *ptr = value;
+  SKIP_WS(ptr);
   if (value) {
     length = KMP_STRLEN(value);
     if (length) {
-      const char *comma = strchr(value, ',');
       if (value[length - 1] == '"' || value[length - 1] == '\'')
         KMP_WARNING(UnbalancedQuotes, name);
-      /* get the specified scheduling style */
-      if (!__kmp_strcasecmp_with_sentinel("dynamic", value, ',')) /* DYNAMIC */
-        __kmp_sched = kmp_sch_dynamic_chunked;
-      else if (!__kmp_strcasecmp_with_sentinel("guided", value,
-                                               ',')) /* GUIDED */
-        __kmp_sched = kmp_sch_guided_chunked;
-      // AC: TODO: add AUTO schedule, and pprobably remove TRAPEZOIDAL (OMP 3.0
-      // does not allow it)
-      else if (!__kmp_strcasecmp_with_sentinel("auto", value, ',')) { /* AUTO */
-        __kmp_sched = kmp_sch_auto;
-        if (comma) {
-          __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, comma),
-                    __kmp_msg_null);
-          comma = NULL;
+/* get the specified scheduling style */
+#if KMP_USE_HIER_SCHED
+      if (!__kmp_strcasecmp_with_sentinel("EXPERIMENTAL", ptr, ' ')) {
+        SKIP_TOKEN(ptr);
+        SKIP_WS(ptr);
+        while ((ptr = __kmp_parse_single_omp_schedule(name, ptr, true))) {
+          while (*ptr == ' ' || *ptr == '\t' || *ptr == ':')
+            ptr++;
         }
-      } else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", value,
-                                                 ',')) /* TRAPEZOIDAL */
-        __kmp_sched = kmp_sch_trapezoidal;
-      else if (!__kmp_strcasecmp_with_sentinel("static", value,
-                                               ',')) /* STATIC */
-        __kmp_sched = kmp_sch_static;
-#if KMP_STATIC_STEAL_ENABLED
-      else if (!__kmp_strcasecmp_with_sentinel("static_steal", value, ','))
-        __kmp_sched = kmp_sch_static_steal;
+      } else
 #endif
-      else {
-        KMP_WARNING(StgInvalidValue, name, value);
-        value = NULL; /* skip processing of comma */
-      }
-      if (value && comma) {
-        if (__kmp_sched == kmp_sch_static)
-          __kmp_sched = kmp_sch_static_chunked;
-        ++comma;
-        __kmp_chunk = __kmp_str_to_int(comma, 0);
-        if (__kmp_chunk < 1) {
-          __kmp_chunk = KMP_DEFAULT_CHUNK;
-          __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, comma),
-                    __kmp_msg_null);
-          KMP_INFORM(Using_int_Value, name, __kmp_chunk);
-          // AC: next block commented out until KMP_DEFAULT_CHUNK !=
-          // KMP_MIN_CHUNK (to improve code coverage :)
-          //     The default chunk size is 1 according to standard, thus making
-          //     KMP_MIN_CHUNK not 1 we would introduce mess:
-          //     wrong chunk becomes 1, but it will be impossible to explicitely
-          //     set 1, because it becomes KMP_MIN_CHUNK...
-          //                } else if ( __kmp_chunk < KMP_MIN_CHUNK ) {
-          //                    __kmp_chunk = KMP_MIN_CHUNK;
-        } else if (__kmp_chunk > KMP_MAX_CHUNK) {
-          __kmp_chunk = KMP_MAX_CHUNK;
-          __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, comma),
-                    __kmp_msg_null);
-          KMP_INFORM(Using_int_Value, name, __kmp_chunk);
-        }
-      }
+        __kmp_parse_single_omp_schedule(name, ptr);
     } else
       KMP_WARNING(EmptyString, name);
   }
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.sort();
+#endif
   K_DIAG(1, ("__kmp_static == %d\n", __kmp_static))
   K_DIAG(1, ("__kmp_guided == %d\n", __kmp_guided))
   K_DIAG(1, ("__kmp_sched == %d\n", __kmp_sched))
@@ -3557,6 +3645,20 @@
   }
 } // __kmp_stg_print_omp_schedule
 
+#if KMP_USE_HIER_SCHED
+// -----------------------------------------------------------------------------
+// KMP_DISP_HAND_THREAD
+static void __kmp_stg_parse_kmp_hand_thread(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &(__kmp_dispatch_hand_threading));
+} // __kmp_stg_parse_kmp_hand_thread
+
+static void __kmp_stg_print_kmp_hand_thread(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_dispatch_hand_threading);
+} // __kmp_stg_print_kmp_hand_thread
+#endif
+
 // -----------------------------------------------------------------------------
 // KMP_ATOMIC_MODE
 
@@ -4139,7 +4241,7 @@
 static void __kmp_stg_parse_speculative_statsfile(char const *name,
                                                   char const *value,
                                                   void *data) {
-  __kmp_stg_parse_file(name, value, "", &__kmp_speculative_statsfile);
+  __kmp_stg_parse_file(name, value, "", CCAST(char**, &__kmp_speculative_statsfile));
 } // __kmp_stg_parse_speculative_statsfile
 
 static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer,
@@ -4626,6 +4728,10 @@
      0, 0},
     {"OMP_SCHEDULE", __kmp_stg_parse_omp_schedule, __kmp_stg_print_omp_schedule,
      NULL, 0, 0},
+#if KMP_USE_HIER_SCHED
+    {"KMP_DISP_HAND_THREAD", __kmp_stg_parse_kmp_hand_thread,
+     __kmp_stg_print_kmp_hand_thread, NULL, 0, 0},
+#endif
     {"KMP_ATOMIC_MODE", __kmp_stg_parse_atomic_mode,
      __kmp_stg_print_atomic_mode, NULL, 0, 0},
     {"KMP_CONSISTENCY_CHECK", __kmp_stg_parse_consistency_check,
diff --git a/runtime/src/kmp_stats.cpp b/runtime/src/kmp_stats.cpp
index d75695f..2c0eabe 100644
--- a/runtime/src/kmp_stats.cpp
+++ b/runtime/src/kmp_stats.cpp
@@ -21,6 +21,7 @@
 #include <iomanip>
 #include <sstream>
 #include <stdlib.h> // for atexit
+#include <cmath>
 
 #define STRINGIZE2(x) #x
 #define STRINGIZE(x) STRINGIZE2(x)
@@ -67,9 +68,18 @@
 // output interface
 static kmp_stats_output_module *__kmp_stats_global_output = NULL;
 
+double logHistogram::binMax[] = {
+    1.e1l,  1.e2l,  1.e3l,  1.e4l,  1.e5l,  1.e6l,  1.e7l,  1.e8l,
+    1.e9l,  1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l,
+    1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l,
+    1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l};
+
 /* ************* statistic member functions ************* */
 
 void statistic::addSample(double sample) {
+  sample -= offset;
+  KMP_DEBUG_ASSERT(std::isfinite(sample));
+
   double delta = sample - meanVal;
 
   sampleCount = sampleCount + 1;
@@ -78,9 +88,14 @@
 
   minVal = std::min(minVal, sample);
   maxVal = std::max(maxVal, sample);
+  if (collectingHist)
+    hist.addSample(sample);
 }
 
 statistic &statistic::operator+=(const statistic &other) {
+  if (other.sampleCount == 0)
+    return *this;
+
   if (sampleCount == 0) {
     *this = other;
     return *this;
@@ -104,6 +119,8 @@
   minVal = std::min(minVal, other.minVal);
   maxVal = std::max(maxVal, other.maxVal);
   sampleCount = newSampleCount;
+  if (collectingHist)
+    hist += other.hist;
 
   return *this;
 }
@@ -138,10 +155,89 @@
   return result;
 }
 
+/* ************* histogram member functions ************* */
+
+// Lowest bin that has anything in it
+int logHistogram::minBin() const {
+  for (int i = 0; i < numBins; i++) {
+    if (bins[i].count != 0)
+      return i - logOffset;
+  }
+  return -logOffset;
+}
+
+// Highest bin that has anything in it
+int logHistogram::maxBin() const {
+  for (int i = numBins - 1; i >= 0; i--) {
+    if (bins[i].count != 0)
+      return i - logOffset;
+  }
+  return -logOffset;
+}
+
+// Which bin does this sample belong in ?
+uint32_t logHistogram::findBin(double sample) {
+  double v = std::fabs(sample);
+  // Simply loop up looking which bin to put it in.
+  // According to a micro-architect this is likely to be faster than a binary
+  // search, since
+  // it will only have one branch mis-predict
+  for (int b = 0; b < numBins; b++)
+    if (binMax[b] > v)
+      return b;
+  fprintf(stderr,
+          "Trying to add a sample that is too large into a histogram\n");
+  KMP_ASSERT(0);
+  return -1;
+}
+
+void logHistogram::addSample(double sample) {
+  if (sample == 0.0) {
+    zeroCount += 1;
+#ifdef KMP_DEBUG
+    _total++;
+    check();
+#endif
+    return;
+  }
+  KMP_DEBUG_ASSERT(std::isfinite(sample));
+  uint32_t bin = findBin(sample);
+  KMP_DEBUG_ASSERT(0 <= bin && bin < numBins);
+
+  bins[bin].count += 1;
+  bins[bin].total += sample;
+#ifdef KMP_DEBUG
+  _total++;
+  check();
+#endif
+}
+
+// This may not be the format we want, but it'll do for now
+std::string logHistogram::format(char unit) const {
+  std::stringstream result;
+
+  result << "Bin,                Count,     Total\n";
+  if (zeroCount) {
+    result << "0,              " << formatSI(zeroCount, 9, ' ') << ", "
+           << formatSI(0.0, 9, unit);
+    if (count(minBin()) == 0)
+      return result.str();
+    result << "\n";
+  }
+  for (int i = minBin(); i <= maxBin(); i++) {
+    result << "10**" << i << "<=v<10**" << (i + 1) << ", "
+           << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit);
+    if (i != maxBin())
+      result << "\n";
+  }
+
+  return result.str();
+}
+
 /* ************* explicitTimer member functions ************* */
 
-void explicitTimer::start(timer_e timerEnumValue) {
-  startTime = tsc_tick_count::now();
+void explicitTimer::start(tsc_tick_count tick) {
+  startTime = tick;
   totalPauseTime = 0;
   if (timeStat::logEvent(timerEnumValue)) {
     __kmp_stats_thread_ptr->incrementNestValue();
@@ -149,22 +245,19 @@
   return;
 }
 
-void explicitTimer::stop(timer_e timerEnumValue,
+void explicitTimer::stop(tsc_tick_count tick,
                          kmp_stats_list *stats_ptr /* = nullptr */) {
   if (startTime.getValue() == 0)
     return;
 
-  tsc_tick_count finishTime = tsc_tick_count::now();
-
-  // stat->addSample ((tsc_tick_count::now() - startTime).ticks());
-  stat->addSample(((finishTime - startTime) - totalPauseTime).ticks());
+  stat->addSample(((tick - startTime) - totalPauseTime).ticks());
 
   if (timeStat::logEvent(timerEnumValue)) {
     if (!stats_ptr)
       stats_ptr = __kmp_stats_thread_ptr;
     stats_ptr->push_event(
         startTime.getValue() - __kmp_stats_start_time.getValue(),
-        finishTime.getValue() - __kmp_stats_start_time.getValue(),
+        tick.getValue() - __kmp_stats_start_time.getValue(),
         __kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
     stats_ptr->decrementNestValue();
   }
@@ -178,52 +271,67 @@
 /* ************* partitionedTimers member functions ************* */
 partitionedTimers::partitionedTimers() { timer_stack.reserve(8); }
 
-// add a timer to this collection of partitioned timers.
-void partitionedTimers::add_timer(explicit_timer_e timer_index,
-                                  explicitTimer *timer_pointer) {
-  KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST + 1);
-  timers[timer_index] = timer_pointer;
-}
-
 // initialize the paritioned timers to an initial timer
-void partitionedTimers::init(timerPair init_timer_pair) {
+void partitionedTimers::init(explicitTimer timer) {
   KMP_DEBUG_ASSERT(this->timer_stack.size() == 0);
-  timer_stack.push_back(init_timer_pair);
-  timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer());
+  timer_stack.push_back(timer);
+  timer_stack.back().start(tsc_tick_count::now());
 }
 
 // stop/save the current timer, and start the new timer (timer_pair)
 // There is a special condition where if the current timer is equal to
 // the one you are trying to push, then it only manipulates the stack,
 // and it won't stop/start the currently running timer.
-void partitionedTimers::push(timerPair timer_pair) {
+void partitionedTimers::push(explicitTimer timer) {
   // get the current timer
-  // stop current timer
+  // pause current timer
   // push new timer
   // start the new timer
+  explicitTimer *current_timer, *new_timer;
+  size_t stack_size;
   KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
-  timerPair current_timer = timer_stack.back();
-  timer_stack.push_back(timer_pair);
-  if (current_timer != timer_pair) {
-    timers[current_timer.get_index()]->pause();
-    timers[timer_pair.get_index()]->start(timer_pair.get_timer());
-  }
+  timer_stack.push_back(timer);
+  stack_size = timer_stack.size();
+  current_timer = &(timer_stack[stack_size - 2]);
+  new_timer = &(timer_stack[stack_size - 1]);
+  tsc_tick_count tick = tsc_tick_count::now();
+  current_timer->pause(tick);
+  new_timer->start(tick);
 }
 
 // stop/discard the current timer, and start the previously saved timer
 void partitionedTimers::pop() {
   // get the current timer
-  // stop current timer
+  // stop current timer (record event/sample)
   // pop current timer
-  // get the new current timer and start it back up
-  KMP_DEBUG_ASSERT(this->timer_stack.size() > 1);
-  timerPair current_timer = timer_stack.back();
+  // get the new current timer and resume
+  explicitTimer *old_timer, *new_timer;
+  size_t stack_size = timer_stack.size();
+  KMP_DEBUG_ASSERT(stack_size > 1);
+  old_timer = &(timer_stack[stack_size - 1]);
+  new_timer = &(timer_stack[stack_size - 2]);
+  tsc_tick_count tick = tsc_tick_count::now();
+  old_timer->stop(tick);
+  new_timer->resume(tick);
   timer_stack.pop_back();
-  timerPair new_timer = timer_stack.back();
-  if (current_timer != new_timer) {
-    timers[current_timer.get_index()]->stop(current_timer.get_timer());
-    timers[new_timer.get_index()]->resume();
-  }
+}
+
+void partitionedTimers::exchange(explicitTimer timer) {
+  // get the current timer
+  // stop current timer (record event/sample)
+  // push new timer
+  // start the new timer
+  explicitTimer *current_timer, *new_timer;
+  size_t stack_size;
+  KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
+  tsc_tick_count tick = tsc_tick_count::now();
+  stack_size = timer_stack.size();
+  current_timer = &(timer_stack[stack_size - 1]);
+  current_timer->stop(tick);
+  timer_stack.pop_back();
+  timer_stack.push_back(timer);
+  new_timer = &(timer_stack[stack_size - 1]);
+  new_timer->start(tick);
 }
 
 // Wind up all the currently running timers.
@@ -234,10 +342,10 @@
   while (timer_stack.size() > 1) {
     this->pop();
   }
+  // Pop the timer from the init() call
   if (timer_stack.size() > 0) {
-    timerPair last_timer = timer_stack.back();
+    timer_stack.back().stop(tsc_tick_count::now());
     timer_stack.pop_back();
-    timers[last_timer.get_index()]->stop(last_timer.get_timer());
   }
 }
 
@@ -255,8 +363,8 @@
 // event2 or zero if event1 == event2. This sorts by start time (lowest to
 // highest).
 int compare_two_events(const void *event1, const void *event2) {
-  kmp_stats_event *ev1 = (kmp_stats_event *)event1;
-  kmp_stats_event *ev2 = (kmp_stats_event *)event2;
+  const kmp_stats_event *ev1 = RCAST(const kmp_stats_event *, event1);
+  const kmp_stats_event *ev2 = RCAST(const kmp_stats_event *, event2);
 
   if (ev1->getStart() < ev2->getStart())
     return -1;
@@ -353,9 +461,93 @@
 int kmp_stats_output_module::printPerThreadFlag = 0;
 int kmp_stats_output_module::printPerThreadEventsFlag = 0;
 
+static char const *lastName(char *name) {
+  int l = strlen(name);
+  for (int i = l - 1; i >= 0; --i) {
+    if (name[i] == '.')
+      name[i] = '_';
+    if (name[i] == '/')
+      return name + i + 1;
+  }
+  return name;
+}
+
+/* Read the name of the executable from /proc/self/cmdline */
+static char const *getImageName(char *buffer, size_t buflen) {
+  FILE *f = fopen("/proc/self/cmdline", "r");
+  buffer[0] = char(0);
+  if (!f)
+    return buffer;
+
+  // The file contains char(0) delimited words from the commandline.
+  // This just returns the last filename component of the first word on the
+  // line.
+  size_t n = fread(buffer, 1, buflen, f);
+  if (n == 0) {
+    fclose(f);
+    KMP_CHECK_SYSFAIL("fread", 1)
+  }
+  fclose(f);
+  buffer[buflen - 1] = char(0);
+  return lastName(buffer);
+}
+
+static void getTime(char *buffer, size_t buflen, bool underscores = false) {
+  time_t timer;
+
+  time(&timer);
+
+  struct tm *tm_info = localtime(&timer);
+  if (underscores)
+    strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info);
+  else
+    strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info);
+}
+
+/* Generate a stats file name, expanding prototypes */
+static std::string generateFilename(char const *prototype,
+                                    char const *imageName) {
+  std::string res;
+
+  for (int i = 0; prototype[i] != char(0); i++) {
+    char ch = prototype[i];
+
+    if (ch == '%') {
+      i++;
+      if (prototype[i] == char(0))
+        break;
+
+      switch (prototype[i]) {
+      case 't': // Insert time and date
+      {
+        char date[26];
+        getTime(date, sizeof(date), true);
+        res += date;
+      } break;
+      case 'e': // Insert executable name
+        res += imageName;
+        break;
+      case 'p': // Insert pid
+      {
+        std::stringstream ss;
+        ss << getpid();
+        res += ss.str();
+      } break;
+      default:
+        res += prototype[i];
+        break;
+      }
+    } else
+      res += ch;
+  }
+  return res;
+}
+
 // init() is called very near the beginning of execution time in the constructor
 // of __kmp_stats_global_output
 void kmp_stats_output_module::init() {
+
+  fprintf(stderr, "*** Stats enabled OpenMP* runtime ***\n");
   char *statsFileName = getenv("KMP_STATS_FILE");
   eventsFileName = getenv("KMP_STATS_EVENTS_FILE");
   plotFileName = getenv("KMP_STATS_PLOT_FILE");
@@ -364,22 +556,10 @@
 
   // set the stats output filenames based on environment variables and defaults
   if (statsFileName) {
-    // append the process id to the output filename
-    // events.csv --> events-pid.csv
-    size_t index;
-    std::string baseFileName, pid, suffix;
-    std::stringstream ss;
-    outputFileName = std::string(statsFileName);
-    index = outputFileName.find_last_of('.');
-    if (index == std::string::npos) {
-      baseFileName = outputFileName;
-    } else {
-      baseFileName = outputFileName.substr(0, index);
-      suffix = outputFileName.substr(index);
-    }
-    ss << getpid();
-    pid = ss.str();
-    outputFileName = baseFileName + "-" + pid + suffix;
+    char imageName[1024];
+    // Process any escapes (e.g., %p, %e, %t) in the name
+    outputFileName = generateFilename(
+        statsFileName, getImageName(&imageName[0], sizeof(imageName)));
   }
   eventsFileName = eventsFileName ? eventsFileName : "events.dat";
   plotFileName = plotFileName ? plotFileName : "events.plt";
@@ -396,8 +576,6 @@
     // will clear flag so that no event will be logged
     timeStat::clearEventFlags();
   }
-
-  return;
 }
 
 void kmp_stats_output_module::setupEventColors() {
@@ -410,28 +588,43 @@
       globalColorIndex = (globalColorIndex + 1) % numGlobalColors;
     }
   }
-  return;
 }
 
 void kmp_stats_output_module::printTimerStats(FILE *statsOut,
                                               statistic const *theStats,
                                               statistic const *totalStats) {
-  fprintf(statsOut, "Timer,                      SampleCount,    Min,      "
-                    "Mean,       Max,     Total,        SD\n");
+  fprintf(statsOut,
+          "Timer,                             SampleCount,    Min,      "
+          "Mean,       Max,     Total,        SD\n");
   for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
     statistic const *stat = &theStats[s];
     char tag = timeStat::noUnits(s) ? ' ' : 'T';
 
-    fprintf(statsOut, "%-28s, %s\n", timeStat::name(s),
+    fprintf(statsOut, "%-35s, %s\n", timeStat::name(s),
             stat->format(tag, true).c_str());
   }
   // Also print the Total_ versions of times.
   for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
     char tag = timeStat::noUnits(s) ? ' ' : 'T';
     if (totalStats && !timeStat::noTotal(s))
-      fprintf(statsOut, "Total_%-22s, %s\n", timeStat::name(s),
+      fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(s),
               totalStats[s].format(tag, true).c_str());
   }
+
+  // Print histogram of statistics
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nTimer distributions\n");
+    for (int s = 0; s < TIMER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T';
+
+        fprintf(statsOut, "%s\n", timeStat::name(timer_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(tag).c_str());
+      }
+    }
+  }
 }
 
 void kmp_stats_output_module::printCounterStats(FILE *statsOut,
@@ -443,6 +636,18 @@
     fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(s)),
             stat->format(' ', true).c_str());
   }
+  // Print histogram of counters
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nCounter distributions\n");
+    for (int s = 0; s < COUNTER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        fprintf(statsOut, "%s\n", counter::name(counter_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(' ').c_str());
+      }
+    }
+  }
 }
 
 void kmp_stats_output_module::printCounters(FILE *statsOut,
@@ -480,9 +685,7 @@
   for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
     kmp_stats_list *ptr = *it;
     ptr->getPartitionedTimers()->windup();
-    for (int timer = 0; timer < EXPLICIT_TIMER_LAST; timer++) {
-      ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer, ptr);
-    }
+    ptr->endLife();
   }
 }
 
@@ -547,6 +750,11 @@
   return;
 }
 
+static void outputEnvVariable(FILE *statsOut, char const *name) {
+  char const *value = getenv(name);
+  fprintf(statsOut, "# %s = %s\n", name, value ? value : "*unspecified*");
+}
+
 /* Print some useful information about
    * the date and time this experiment ran.
    * the machine on which it ran.
@@ -570,6 +778,11 @@
   else
     fprintf(statsOut, "# Nominal frequency: %sz\n",
             formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str());
+  outputEnvVariable(statsOut, "KMP_HW_SUBSET");
+  outputEnvVariable(statsOut, "KMP_AFFINITY");
+  outputEnvVariable(statsOut, "KMP_BLOCKTIME");
+  outputEnvVariable(statsOut, "KMP_LIBRARY");
+  fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n");
 #endif
 }
 
@@ -665,7 +878,6 @@
   for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
     timeStat *timers = (*it)->getTimers();
     counter *counters = (*it)->getCounters();
-    explicitTimer *eTimers = (*it)->getExplicitTimers();
 
     for (int t = 0; t < TIMER_LAST; t++)
       timers[t].reset();
@@ -673,9 +885,6 @@
     for (int c = 0; c < COUNTER_LAST; c++)
       counters[c].reset();
 
-    for (int t = 0; t < EXPLICIT_TIMER_LAST; t++)
-      eTimers[t].reset();
-
     // reset the event vector so all previous events are "erased"
     (*it)->resetEventVector();
   }
diff --git a/runtime/src/kmp_stats.h b/runtime/src/kmp_stats.h
index 63ce4a2..be94843 100644
--- a/runtime/src/kmp_stats.h
+++ b/runtime/src/kmp_stats.h
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "kmp_config.h"
+#include "kmp_debug.h"
 
 #if KMP_STATS_ENABLED
 /* Statistics accumulator.
@@ -36,7 +37,10 @@
 /* Enable developer statistics here if you want them. They are more detailed
    than is useful for application characterisation and are intended for the
    runtime library developer. */
-// #define KMP_DEVELOPER_STATS 1
+#define KMP_DEVELOPER_STATS 0
+
+/* Enable/Disable histogram output */
+#define KMP_STATS_HIST 0
 
 /*!
  * @ingroup STATS_GATHERING
@@ -46,8 +50,7 @@
 enum stats_flags_e {
   noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
   onlyInMaster = 1 << 1, //!< statistic is valid only for master
-  noUnits =
-      1 << 2, //!< statistic doesn't need units printed next to it in output
+  noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
   notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
   logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
   //! KMP_STATS_EVENTS is on (valid only for timers)
@@ -90,20 +93,26 @@
  */
 // clang-format off
 #define KMP_FOREACH_COUNTER(macro, arg)                                        \
-  macro(OMP_PARALLEL, stats_flags_e::onlyInMaster | stats_flags_e::noTotal,    \
-        arg) macro(OMP_NESTED_PARALLEL, 0, arg) macro(OMP_FOR_static, 0, arg)  \
-      macro(OMP_FOR_static_steal, 0, arg) macro(OMP_FOR_dynamic, 0, arg)       \
-          macro(OMP_DISTRIBUTE, 0, arg) macro(OMP_BARRIER, 0, arg)             \
-              macro(OMP_CRITICAL, 0, arg) macro(OMP_SINGLE, 0, arg)            \
-                  macro(OMP_MASTER, 0, arg) macro(OMP_TEAMS, 0, arg)           \
-                      macro(OMP_set_lock, 0, arg) macro(OMP_test_lock, 0, arg) \
-                          macro(REDUCE_wait, 0, arg)                           \
-                              macro(REDUCE_nowait, 0, arg)                     \
-                                  macro(OMP_TASKYIELD, 0, arg)                 \
-                                      macro(OMP_TASKLOOP, 0, arg)              \
-                                          macro(TASK_executed, 0, arg)         \
-                                              macro(TASK_cancelled, 0, arg)    \
-                                                  macro(TASK_stolen, 0, arg)
+  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
+  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
+  macro(OMP_LOOP_STATIC, 0, arg)                                               \
+  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
+  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
+  macro(OMP_DISTRIBUTE, 0, arg)                                                \
+  macro(OMP_BARRIER, 0, arg)                                                   \
+  macro(OMP_CRITICAL, 0, arg)                                                  \
+  macro(OMP_SINGLE, 0, arg)                                                    \
+  macro(OMP_MASTER, 0, arg)                                                    \
+  macro(OMP_TEAMS, 0, arg)                                                     \
+  macro(OMP_set_lock, 0, arg)                                                  \
+  macro(OMP_test_lock, 0, arg)                                                 \
+  macro(REDUCE_wait, 0, arg)                                                   \
+  macro(REDUCE_nowait, 0, arg)                                                 \
+  macro(OMP_TASKYIELD, 0, arg)                                                 \
+  macro(OMP_TASKLOOP, 0, arg)                                                  \
+  macro(TASK_executed, 0, arg)                                                 \
+  macro(TASK_cancelled, 0, arg)                                                \
+  macro(TASK_stolen, 0, arg)
 // clang-format on
 
 /*!
@@ -114,7 +123,7 @@
  * @param arg a user defined argument to send to the user defined macro
  *
  * \details A timer collects multiple samples of some count in each thread and
- * then finally aggregates alll of the samples from all of the threads. For most
+ * then finally aggregates all of the samples from all of the threads. For most
  * timers the printing code also provides an aggregation over the thread totals.
  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
  * hence the name "timer". (But can be any value, so we use this for "number of
@@ -126,55 +135,59 @@
  */
 // clang-format off
 #define KMP_FOREACH_TIMER(macro, arg)                                          \
-    macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)               \
-    macro (FOR_static_scheduling, 0, arg)                                      \
-    macro (FOR_dynamic_scheduling, 0, arg)                                     \
-    macro (OMP_critical, 0, arg)                                               \
-    macro (OMP_critical_wait, 0, arg)                                          \
-    macro (OMP_single, 0, arg)                                                 \
-    macro (OMP_master, 0, arg)                                                 \
-    macro (OMP_idle, stats_flags_e::logEvent, arg)                             \
-    macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                    \
-    macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                     \
-    macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                     \
-    macro (OMP_parallel, stats_flags_e::logEvent, arg)                         \
-    macro (OMP_task_immediate, 0, arg)                                         \
-    macro (OMP_task_taskwait, 0, arg)                                          \
-    macro (OMP_task_taskyield, 0, arg)                                         \
-    macro (OMP_task_taskgroup, 0, arg)                                         \
-    macro (OMP_task_join_bar, 0, arg)                                          \
-    macro (OMP_task_plain_bar, 0, arg)                                         \
-    macro (OMP_serial, stats_flags_e::logEvent, arg)                           \
-    macro (OMP_taskloop_scheduling, 0, arg)                                    \
-    macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,\
-           arg)                                                                \
-    macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
-           arg)                                                                \
-    macro (FOR_static_iterations,                                              \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_dynamic_iterations,                                             \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_static_steal_stolen,                                            \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_static_steal_chunks,                                            \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
+  macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
+  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
+  macro (OMP_loop_static, 0, arg)                                              \
+  macro (OMP_loop_static_scheduling, 0, arg)                                   \
+  macro (OMP_loop_dynamic, 0, arg)                                             \
+  macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
+  macro (OMP_critical, 0, arg)                                                 \
+  macro (OMP_critical_wait, 0, arg)                                            \
+  macro (OMP_single, 0, arg)                                                   \
+  macro (OMP_master, 0, arg)                                                   \
+  macro (OMP_task_immediate, 0, arg)                                           \
+  macro (OMP_task_taskwait, 0, arg)                                            \
+  macro (OMP_task_taskyield, 0, arg)                                           \
+  macro (OMP_task_taskgroup, 0, arg)                                           \
+  macro (OMP_task_join_bar, 0, arg)                                            \
+  macro (OMP_task_plain_bar, 0, arg)                                           \
+  macro (OMP_taskloop_scheduling, 0, arg)                                      \
+  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
+  macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
+  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
+  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
+         arg)                                                                  \
+  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
+         arg)                                                                  \
+  macro (OMP_loop_static_iterations,                                           \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_loop_dynamic_iterations,                                          \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
 // clang-format on
 
-// OMP_start_end          -- Time from when OpenMP is initialized until the
-//                           stats are printed at exit
-// OMP_serial             -- Thread zero time executing serial code
-// OMP_work               -- Elapsed time in code dispatched by a fork (measured
-//                           in the thread)
-// OMP_barrier            -- Time at "real" barriers (includes task time)
-// FOR_static_scheduling  -- Time spent doing scheduling for a static "for"
-// FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for"
-// OMP_idle               -- Worker threads time spent waiting for inclusion in
-//                           a parallel region
-// OMP_plain_barrier      -- Time spent in a barrier construct
-// OMP_fork_join_barrier  -- Time spent in a the fork-join barrier surrounding a
-//                           parallel region
-// OMP_parallel           -- Time spent inside a parallel construct
+// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
+//                           initializing OpenMP or being created by a master)
+//                           until the thread is destroyed
+// OMP_parallel           -- Time thread spends executing work directly
+//                           within a #pragma omp parallel
+// OMP_parallel_overhead  -- Time thread spends setting up a parallel region
+// OMP_loop_static        -- Time thread spends executing loop iterations from
+//                           a statically scheduled loop
+// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
+//                               from a statically scheduled loop
+// OMP_loop_dynamic       -- Time thread spends executing loop iterations from
+//                           a dynamically scheduled loop
+// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
+//                                from a dynamically scheduled loop
+// OMP_critical           -- Time thread spends executing critical section
+// OMP_critical_wait      -- Time thread spends waiting to enter
+//                           a critical section
+// OMP_single             -- Time spent executing a "single" region
+// OMP_master             -- Time spent executing a "master" region
 // OMP_task_immediate     -- Time spent executing non-deferred tasks
 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
 //                           construct
@@ -185,17 +198,24 @@
 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
 //                           construct
-// OMP_single             -- Time spent executing a "single" region
-// OMP_master             -- Time spent executing a "master" region
+// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
+//                            construct
+// OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
+//                           inside implicit barrier at end of worksharing
+//                           construct
+// OMP_idle               -- Time worker threads spend waiting for next
+//                           parallel region
+// OMP_fork_barrier       -- Time spent in the fork barrier surrounding a
+//                           parallel region
+// OMP_join_barrier       -- Time spent in the join barrier surrounding a
+//                           parallel region
+// OMP_serial             -- Time thread zero spends executing serial code
 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
-// FOR_static_iterations  -- Number of available parallel chunks of work in a
-//                           static for
-// FOR_dynamic_iterations -- Number of available parallel chunks of work in a
-//                           dynamic for
-//                           Both adjust for any chunking, so if there were an
-//                           iteration count of 20 but a chunk size of 10, we'd
-//                           record 2.
+// OMP_loop_static_iterations -- Number of iterations thread is assigned for
+//                               statically scheduled loops
+// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
+//                                dynamically scheduled loops
 
 #if (KMP_DEVELOPER_STATS)
 // Timers which are of interest to runtime library developers, not end users.
@@ -213,20 +233,32 @@
 // KMP_tree_release       -- time in __kmp_tree_barrier_release
 // KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
 // KMP_hyper_release      -- time in __kmp_hyper_barrier_release
+// clang-format off
 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
-  macro(KMP_fork_call, 0, arg) macro(KMP_join_call, 0, arg) macro(             \
-      KMP_end_split_barrier, 0, arg) macro(KMP_hier_gather, 0, arg)            \
-      macro(KMP_hier_release, 0, arg) macro(KMP_hyper_gather, 0, arg)          \
-          macro(KMP_hyper_release, 0, arg) macro(KMP_linear_gather, 0, arg)    \
-              macro(KMP_linear_release, 0, arg) macro(KMP_tree_gather, 0, arg) \
-                  macro(KMP_tree_release, 0, arg) macro(USER_resume, 0, arg)   \
-                      macro(USER_suspend, 0, arg)                              \
-                          macro(KMP_allocate_team, 0, arg)                     \
-                              macro(KMP_setup_icv_copy, 0, arg)                \
-                                  macro(USER_icv_copy, 0, arg)
+  macro(KMP_fork_call, 0, arg)                                                 \
+  macro(KMP_join_call, 0, arg)                                                 \
+  macro(KMP_end_split_barrier, 0, arg)                                         \
+  macro(KMP_hier_gather, 0, arg)                                               \
+  macro(KMP_hier_release, 0, arg)                                              \
+  macro(KMP_hyper_gather, 0, arg)                                              \
+  macro(KMP_hyper_release, 0, arg)                                             \
+  macro(KMP_linear_gather, 0, arg)                                             \
+  macro(KMP_linear_release, 0, arg)                                            \
+  macro(KMP_tree_gather, 0, arg)                                               \
+  macro(KMP_tree_release, 0, arg)                                              \
+  macro(USER_resume, 0, arg)                                                   \
+  macro(USER_suspend, 0, arg)                                                  \
+  macro(KMP_allocate_team, 0, arg)                                             \
+  macro(KMP_setup_icv_copy, 0, arg)                                            \
+  macro(USER_icv_copy, 0, arg)                                                 \
+  macro (FOR_static_steal_stolen,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (FOR_static_steal_chunks,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
 #else
 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
 #endif
+// clang-format on
 
 /*!
  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
@@ -259,53 +291,143 @@
 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
 #undef ENUMERATE
 
-class timerPair {
-  explicit_timer_e timer_index;
-  timer_e timer;
+/*
+ * A logarithmic histogram. It accumulates the number of values in each power of
+ * ten bin.  So 1<=x<10, 10<=x<100, ...
+ * Mostly useful where we have some big outliers and want to see information
+ * about them.
+ */
+class logHistogram {
+  enum {
+    numBins = 31, /* Number of powers of 10. If this changes you need to change
+                   * the initializer for binMax */
+
+    /*
+     * If you want to use this to analyse values that may be less than 1, (for
+     * instance times in s), then the logOffset gives you negative powers.
+     * In our case here, we're just looking at times in ticks, or counts, so we
+     * can never see values with magnitude < 1 (other than zero), so we can set
+     * it to 0.  As above change the initializer if you change this.
+     */
+    logOffset = 0
+  };
+  uint32_t KMP_ALIGN_CACHE zeroCount;
+  struct {
+    uint32_t count;
+    double total;
+  } bins[numBins];
+
+  static double binMax[numBins];
+
+#ifdef KMP_DEBUG
+  uint64_t _total;
+
+  void check() const {
+    uint64_t t = zeroCount;
+    for (int i = 0; i < numBins; i++)
+      t += bins[i].count;
+    KMP_DEBUG_ASSERT(t == _total);
+  }
+#else
+  void check() const {}
+#endif
 
 public:
-  timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {}
-  inline explicit_timer_e get_index() const { return timer_index; }
-  inline timer_e get_timer() const { return timer; }
-  bool operator==(const timerPair &rhs) {
-    return this->get_index() == rhs.get_index();
+  logHistogram() { reset(); }
+
+  logHistogram(logHistogram const &o) {
+    for (int i = 0; i < numBins; i++)
+      bins[i] = o.bins[i];
+#ifdef KMP_DEBUG
+    _total = o._total;
+#endif
   }
-  bool operator!=(const timerPair &rhs) { return !(*this == rhs); }
+
+  void reset() {
+    zeroCount = 0;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count = 0;
+      bins[i].total = 0;
+    }
+
+#ifdef KMP_DEBUG
+    _total = 0;
+#endif
+  }
+  uint32_t count(int b) const { return bins[b + logOffset].count; }
+  double total(int b) const { return bins[b + logOffset].total; }
+  static uint32_t findBin(double sample);
+
+  logHistogram &operator+=(logHistogram const &o) {
+    zeroCount += o.zeroCount;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count += o.bins[i].count;
+      bins[i].total += o.bins[i].total;
+    }
+#ifdef KMP_DEBUG
+    _total += o._total;
+    check();
+#endif
+
+    return *this;
+  }
+
+  void addSample(double sample);
+  int minBin() const;
+  int maxBin() const;
+
+  std::string format(char) const;
 };
 
 class statistic {
-  double minVal;
+  double KMP_ALIGN_CACHE minVal;
   double maxVal;
   double meanVal;
   double m2;
   uint64_t sampleCount;
+  double offset;
+  bool collectingHist;
+  logHistogram hist;
 
 public:
-  statistic() { reset(); }
+  statistic(bool doHist = bool(KMP_STATS_HIST)) {
+    reset();
+    collectingHist = doHist;
+  }
   statistic(statistic const &o)
       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
-        sampleCount(o.sampleCount) {}
-
+        sampleCount(o.sampleCount), offset(o.offset),
+        collectingHist(o.collectingHist), hist(o.hist) {}
+  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
+      : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
+        sampleCount(sc), offset(0.0), collectingHist(false) {}
+  bool haveHist() const { return collectingHist; }
   double getMin() const { return minVal; }
   double getMean() const { return meanVal; }
   double getMax() const { return maxVal; }
   uint64_t getCount() const { return sampleCount; }
   double getSD() const { return sqrt(m2 / sampleCount); }
   double getTotal() const { return sampleCount * meanVal; }
+  logHistogram const *getHist() const { return &hist; }
+  void setOffset(double d) { offset = d; }
 
   void reset() {
     minVal = std::numeric_limits<double>::max();
-    maxVal = -std::numeric_limits<double>::max();
+    maxVal = -minVal;
     meanVal = 0.0;
     m2 = 0.0;
     sampleCount = 0;
+    offset = 0.0;
+    hist.reset();
   }
   void addSample(double sample);
   void scale(double factor);
   void scaleDown(double f) { scale(1. / f); }
+  void forceCount(uint64_t count) { sampleCount = count; }
   statistic &operator+=(statistic const &other);
 
   std::string format(char unit, bool total = false) const;
+  std::string formatHist(char unit) const { return hist.format(unit); }
 };
 
 struct statInfo {
@@ -346,39 +468,29 @@
 // to live on the stack of the thread, they're more work to use.
 class explicitTimer {
   timeStat *stat;
+  timer_e timerEnumValue;
   tsc_tick_count startTime;
   tsc_tick_count pauseStartTime;
   tsc_tick_count::tsc_interval_t totalPauseTime;
 
 public:
-  explicitTimer()
-      : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() {}
-  explicitTimer(timeStat *s)
-      : stat(s), startTime(), pauseStartTime(0), totalPauseTime() {}
+  explicitTimer(timeStat *s, timer_e te)
+      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
+        totalPauseTime() {}
 
-  void setStat(timeStat *s) { stat = s; }
-  void start(timer_e timerEnumValue);
-  void pause() { pauseStartTime = tsc_tick_count::now(); }
-  void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); }
-  void stop(timer_e timerEnumValue, kmp_stats_list *stats_ptr = nullptr);
+  // void setStat(timeStat *s) { stat = s; }
+  void start(tsc_tick_count tick);
+  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
+  void resume(tsc_tick_count tick) {
+    totalPauseTime += (tick - pauseStartTime);
+  }
+  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
   void reset() {
     startTime = 0;
     pauseStartTime = 0;
     totalPauseTime = 0;
   }
-};
-
-// Where all you need is to time a block, this is enough.
-// (It avoids the need to have an explicit end, leaving the scope suffices.)
-class blockTimer : public explicitTimer {
-  timer_e timerEnumValue;
-
-public:
-  blockTimer(timeStat *s, timer_e newTimerEnumValue)
-      : timerEnumValue(newTimerEnumValue), explicitTimer(s) {
-    start(timerEnumValue);
-  }
-  ~blockTimer() { stop(timerEnumValue); }
+  timer_e get_type() const { return timerEnumValue; }
 };
 
 // Where you need to partition a threads clock ticks into separate states
@@ -389,14 +501,13 @@
 // versa
 class partitionedTimers {
 private:
-  explicitTimer *timers[EXPLICIT_TIMER_LAST + 1];
-  std::vector<timerPair> timer_stack;
+  std::vector<explicitTimer> timer_stack;
 
 public:
   partitionedTimers();
-  void add_timer(explicit_timer_e timer_index, explicitTimer *timer_pointer);
-  void init(timerPair timer_index);
-  void push(timerPair timer_index);
+  void init(explicitTimer timer);
+  void exchange(explicitTimer timer);
+  void push(explicitTimer timer);
   void pop();
   void windup();
 };
@@ -405,12 +516,11 @@
 // It avoids the need to have an explicit end, leaving the scope suffices.
 class blockPartitionedTimer {
   partitionedTimers *part_timers;
-  timerPair timer_pair;
 
 public:
-  blockPartitionedTimer(partitionedTimers *pt, timerPair tp)
-      : part_timers(pt), timer_pair(tp) {
-    part_timers->push(timer_pair);
+  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
+      : part_timers(pt) {
+    part_timers->push(timer);
   }
   ~blockPartitionedTimer() { part_timers->pop(); }
 };
@@ -596,7 +706,7 @@
   int gtid;
   timeStat _timers[TIMER_LAST + 1];
   counter _counters[COUNTER_LAST + 1];
-  explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST + 1];
+  explicitTimer thread_life_timer;
   partitionedTimers _partitionedTimers;
   int _nestLevel; // one per thread
   kmp_stats_event_vector _event_vector;
@@ -607,28 +717,21 @@
 
 public:
   kmp_stats_list()
-      : _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
-        thread_is_idle_flag(0) {
-#define doInit(name, ignore1, ignore2)                                         \
-  getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name));    \
-  _partitionedTimers.add_timer(EXPLICIT_TIMER_##name,                          \
-                               getExplicitTimer(EXPLICIT_TIMER_##name));
-    KMP_FOREACH_EXPLICIT_TIMER(doInit, 0);
-#undef doInit
-  }
+      : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
+                          TIMER_OMP_worker_thread_life),
+        _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
+        thread_is_idle_flag(0) {}
   ~kmp_stats_list() {}
   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
-  inline explicitTimer *getExplicitTimer(explicit_timer_e idx) {
-    return &_explicitTimers[idx];
-  }
   inline partitionedTimers *getPartitionedTimers() {
     return &_partitionedTimers;
   }
   inline timeStat *getTimers() { return _timers; }
   inline counter *getCounters() { return _counters; }
-  inline explicitTimer *getExplicitTimers() { return _explicitTimers; }
   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
+  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
+  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
   inline void resetEventVector() { _event_vector.reset(); }
   inline void incrementNestValue() { _nestLevel++; }
   inline int getNestValue() { return _nestLevel; }
@@ -761,22 +864,6 @@
 // Simple, standard interfaces that drop out completely if stats aren't enabled
 
 /*!
- * \brief Uses specified timer (name) to time code block.
- *
- * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
- *
- * \details Use KMP_TIME_BLOCK(name) macro to time a code block.  This will
- * record the time taken in the block and use the destructor to stop the timer.
- * Convenient! With this definition you can't have more than one KMP_TIME_BLOCK
- * in the same code block. I don't think that's a problem.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_TIME_BLOCK(name)                                                   \
-  blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name),     \
-                           TIMER_##name)
-
-/*!
  * \brief Adds value to specified timer (name).
  *
  * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
@@ -804,44 +891,6 @@
   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
 
 /*!
- * \brief "Starts" an explicit timer which will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use to start a timer.  This will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro to stop the timer unlike the
- * KMP_TIME_BLOCK(name) macro which has an implicit stopping macro at the end
- * of the code block.  All explicit timers are stopped at library exit time
- * before the final statistics are outputted.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_START_EXPLICIT_TIMER(name)                                         \
-  __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)              \
-      ->start(TIMER_##name)
-
-/*!
- * \brief "Stops" an explicit timer.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer.  When this is
- * done, the time between the last KMP_START_EXPLICIT_TIMER(name) and this
- * KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value. The
- * timer will then be reset. After the KMP_STOP_EXPLICIT_TIMER(name) macro is
- * called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer
- * once again.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_STOP_EXPLICIT_TIMER(name)                                          \
-  __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)              \
-      ->stop(TIMER_##name)
-
-/*!
  * \brief Outputs the current thread statistics and reset them.
  *
  * @param heading_string heading put above the final stats output
@@ -868,21 +917,26 @@
  * @ingroup STATS_GATHERING
 */
 #define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
-  __kmp_stats_thread_ptr->getPartitionedTimers()->init(                        \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
 
 #define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
   blockPartitionedTimer __PBLOCKTIME__(                                        \
       __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
+                    TIMER_##name))
 
 #define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
-  __kmp_stats_thread_ptr->getPartitionedTimers()->push(                        \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
 
 #define KMP_POP_PARTITIONED_TIMER()                                            \
   __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
 
+#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
 #define KMP_SET_THREAD_STATE(state_name)                                       \
   __kmp_stats_thread_ptr->setState(state_name)
 
diff --git a/runtime/src/kmp_stats_timing.cpp b/runtime/src/kmp_stats_timing.cpp
index fe85dc4..2fcbaab 100644
--- a/runtime/src/kmp_stats_timing.cpp
+++ b/runtime/src/kmp_stats_timing.cpp
@@ -95,10 +95,11 @@
     static struct {
       double scale;
       char prefix;
-    } ranges[] = {{1.e12, 'f'},  {1.e9, 'p'},   {1.e6, 'n'},   {1.e3, 'u'},
-                  {1.0, 'm'},    {1.e-3, ' '},  {1.e-6, 'k'},  {1.e-9, 'M'},
-                  {1.e-12, 'G'}, {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'},
-                  {1.e-24, 'Z'}, {1.e-27, 'Y'}};
+    } ranges[] = {{1.e21, 'y'},  {1.e18, 'z'},  {1.e15, 'a'},  {1.e12, 'f'},
+                  {1.e9, 'p'},   {1.e6, 'n'},   {1.e3, 'u'},   {1.0, 'm'},
+                  {1.e-3, ' '},  {1.e-6, 'k'},  {1.e-9, 'M'},  {1.e-12, 'G'},
+                  {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'},
+                  {1.e-27, 'Y'}};
 
     if (interval == 0.0) {
       os << std::setw(width - 3) << std::right << "0.00" << std::setw(3)
diff --git a/runtime/src/kmp_stub.cpp b/runtime/src/kmp_stub.cpp
index 4051d9e..6e11e82 100644
--- a/runtime/src/kmp_stub.cpp
+++ b/runtime/src/kmp_stub.cpp
@@ -46,7 +46,9 @@
 #define kmp_realloc kmpc_realloc
 #define kmp_free kmpc_free
 
+#if KMP_OS_WINDOWS
 static double frequency = 0.0;
+#endif
 
 // Helper functions.
 static size_t __kmps_init() {
diff --git a/runtime/src/kmp_taskdeps.cpp b/runtime/src/kmp_taskdeps.cpp
index dedf022..4efd12b 100644
--- a/runtime/src/kmp_taskdeps.cpp
+++ b/runtime/src/kmp_taskdeps.cpp
@@ -32,7 +32,7 @@
 // TODO: Any ITT support needed?
 
 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
-static kmp_int32 kmp_node_id_seed = 0;
+static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0);
 #endif
 
 static void __kmp_init_node(kmp_depnode_t *node) {
@@ -40,14 +40,15 @@
   // task once dependences have been processed
   node->dn.successors = NULL;
   __kmp_init_lock(&node->dn.lock);
-  node->dn.nrefs = 1; // init creates the first reference to the node
+  KMP_ATOMIC_ST_RLX(&node->dn.nrefs,
+                    1); // init creates the first reference to the node
 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
-  node->dn.id = KMP_TEST_THEN_INC32(&kmp_node_id_seed);
+  node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
 #endif
 }
 
 static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
-  KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, &node->dn.nrefs));
+  KMP_ATOMIC_INC(&node->dn.nrefs);
   return node;
 }
 
@@ -55,7 +56,7 @@
   if (!node)
     return;
 
-  kmp_int32 n = KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &node->dn.nrefs)) - 1;
+  kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
   if (n == 0) {
     KMP_ASSERT(node->dn.nrefs == 0);
 #if USE_FAST_MEMORY
@@ -375,9 +376,7 @@
   // any outstandig dependences (some tasks may have finished while we processed
   // the dependences)
   npredecessors =
-      KMP_TEST_THEN_ADD32(CCAST(kmp_int32 *, &node->dn.npredecessors),
-                          npredecessors) +
-      npredecessors;
+      node->dn.npredecessors.fetch_add(npredecessors) + npredecessors;
 
   KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n",
                 gtid, npredecessors, taskdata));
@@ -413,9 +412,8 @@
   kmp_depnode_list_t *next;
   for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) {
     kmp_depnode_t *successor = p->node;
-    kmp_int32 npredecessors =
-        KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &successor->dn.npredecessors)) -
-        1;
+    kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1;
+
     // successor task can be NULL for wait_depends or because deps are still
     // being processed
     if (npredecessors == 0) {
@@ -648,7 +646,7 @@
     return;
   }
 
-  kmp_depnode_t node;
+  kmp_depnode_t node = {0};
   __kmp_init_node(&node);
 
   if (!__kmp_check_deps(gtid, &node, NULL, current_task->td_dephash,
@@ -661,12 +659,10 @@
   }
 
   int thread_finished = FALSE;
-  kmp_flag_32 flag((volatile kmp_uint32 *)&(node.dn.npredecessors), 0U);
+  kmp_flag_32 flag((std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
   while (node.dn.npredecessors > 0) {
-    flag.execute_tasks(thread, gtid, FALSE, &thread_finished,
-#if USE_ITT_BUILD
-                       NULL,
-#endif
+    flag.execute_tasks(thread, gtid, FALSE,
+                       &thread_finished USE_ITT_BUILD_ARG(NULL),
                        __kmp_task_stealing_constraint);
   }
 
diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp
index 657d314..5a4df0c 100644
--- a/runtime/src/kmp_tasking.cpp
+++ b/runtime/src/kmp_tasking.cpp
@@ -264,7 +264,8 @@
   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
     // untied task needs to increment counter so that the task structure is not
     // freed prematurely
-    kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
+    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
+    KMP_DEBUG_USE_VAR(counter);
     KA_TRACE(
         20,
         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
@@ -488,13 +489,10 @@
 
 // __ompt_task_finish:
 //   Build and trigger final task-schedule event
-static inline void __ompt_task_finish(kmp_task_t *task,
-                                      kmp_taskdata_t *resumed_task) {
+static inline void
+__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
+                   ompt_task_status_t status = ompt_task_complete) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  ompt_task_status_t status = ompt_task_complete;
-  if (taskdata->td_flags.tiedness == TASK_UNTIED &&
-      KMP_TEST_THEN_ADD32(&(taskdata->td_untied_count), 0) > 1)
-    status = ompt_task_others;
   if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
       taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
     status = ompt_task_cancel;
@@ -528,7 +526,8 @@
   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
     // untied task needs to increment counter so that the task structure is not
     // freed prematurely
-    kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
+    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
+    KMP_DEBUG_USE_VAR(counter);
     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                   "incremented for task %p\n",
                   gtid, counter, taskdata));
@@ -625,9 +624,9 @@
   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
-  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
+  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                    taskdata->td_flags.task_serial == 1);
-  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
 
   taskdata->td_flags.freed = 1;
   ANNOTATE_HAPPENS_BEFORE(taskdata);
@@ -662,8 +661,7 @@
 #endif
   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
 
-  kmp_int32 children =
-      KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
+  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
   KMP_DEBUG_ASSERT(children >= 0);
 
   // Now, go up the ancestor tree to see if any ancestors can now be freed.
@@ -685,7 +683,7 @@
       return;
 
     // Predecrement simulated by "- 1" calculation
-    children = KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
+    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
     KMP_DEBUG_ASSERT(children >= 0);
   }
 
@@ -700,6 +698,7 @@
 // gtid: global thread ID for calling thread
 // task: task to be finished
 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
+template <bool ompt>
 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                               kmp_taskdata_t *resumed_task) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
@@ -724,7 +723,7 @@
   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
     // untied task needs to check the counter so that the task structure is not
     // freed prematurely
-    kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
+    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
     KA_TRACE(
         20,
         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
@@ -745,6 +744,10 @@
       return;
     }
   }
+#if OMPT_SUPPORT
+  if (ompt)
+    __ompt_task_finish(task, resumed_task);
+#endif
 
   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
   taskdata->td_flags.complete = 1; // mark the task as completed
@@ -756,12 +759,11 @@
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
     // Predecrement simulated by "- 1" calculation
     children =
-        KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) -
-        1;
+        KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
     KMP_DEBUG_ASSERT(children >= 0);
 #if OMP_40_ENABLED
     if (taskdata->td_taskgroup)
-      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
+      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
 #if OMP_45_ENABLED
   }
   // if we found proxy tasks there could exist a dependency chain
@@ -837,14 +839,13 @@
   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
   // this routine will provide task to resume
-  __kmp_task_finish(gtid, task, NULL);
+  __kmp_task_finish<ompt>(gtid, task, NULL);
 
   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
 
 #if OMPT_SUPPORT
   if (ompt) {
-    __ompt_task_finish(task, NULL);
     omp_frame_t *ompt_frame;
     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
     ompt_frame->enter_frame = NULL;
@@ -886,7 +887,8 @@
   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
 
-  __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume
+  __kmp_task_finish<false>(gtid, task,
+                           NULL); // Not sure how to find task to resume
 
   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
@@ -945,9 +947,9 @@
   task->td_last_tied = task;
 
   if (set_curr_task) { // only do this init first time thread is created
-    task->td_incomplete_child_tasks = 0;
+    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
     // Not used: don't need to deallocate implicit task
-    task->td_allocated_child_tasks = 0;
+    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
 #if OMP_40_ENABLED
     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
     task->td_dephash = NULL;
@@ -1025,6 +1027,9 @@
   kmp_taskdata_t *parent_task = thread->th.th_current_task;
   size_t shareds_offset;
 
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+
   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
@@ -1130,7 +1135,7 @@
   taskdata->td_alloc_thread = thread;
   taskdata->td_parent = parent_task;
   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
-  taskdata->td_untied_count = 0;
+  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
   taskdata->td_ident = loc_ref;
   taskdata->td_taskwait_ident = NULL;
   taskdata->td_taskwait_counter = 0;
@@ -1176,9 +1181,9 @@
 
   taskdata->td_flags.native = flags->native;
 
-  taskdata->td_incomplete_child_tasks = 0;
-  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
-// task and children
+  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+  // start at one because counts current task and children
+  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
 #if OMP_40_ENABLED
   taskdata->td_taskgroup =
       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
@@ -1190,6 +1195,10 @@
   else
     taskdata->td_last_tied = taskdata;
 
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled))
+    __ompt_task_init(taskdata, gtid);
+#endif
 // Only need to keep track of child task counts if team parallel and tasking not
 // serialized or if it is a proxy task
 #if OMP_45_ENABLED
@@ -1199,15 +1208,15 @@
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
 #endif
   {
-    KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
+    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
 #if OMP_40_ENABLED
     if (parent_task->td_taskgroup)
-      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
+      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
 #endif
     // Only need to keep track of allocated child tasks for explicit tasks since
     // implicit not deallocated
     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
-      KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
+      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
     }
   }
 
@@ -1215,11 +1224,6 @@
                 gtid, taskdata, taskdata->td_parent));
   ANNOTATE_HAPPENS_BEFORE(task);
 
-#if OMPT_SUPPORT
-  if (UNLIKELY(ompt_enabled.enabled))
-    __ompt_task_init(taskdata, gtid);
-#endif
-
   return task;
 }
 
@@ -1262,7 +1266,6 @@
 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                               kmp_taskdata_t *current_task) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  kmp_uint64 cur_time;
 #if OMP_40_ENABLED
   int discard = 0 /* false */;
 #endif
@@ -1291,6 +1294,7 @@
 #endif
 
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
+  kmp_uint64 cur_time;
   if (__kmp_forkjoin_frames_mode == 3) {
     // Get the current time stamp to measure task execution time to correct
     // barrier imbalance time
@@ -1298,17 +1302,9 @@
   }
 #endif
 
-#if OMP_45_ENABLED
-  // Proxy tasks are not handled by the runtime
-  if (taskdata->td_flags.proxy != TASK_PROXY) {
-#endif
-    ANNOTATE_HAPPENS_AFTER(task);
-    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
-#if OMP_45_ENABLED
-  }
-#endif
-
 #if OMPT_SUPPORT
+  // For untied tasks, the first task executed only calls __kmpc_omp_task and
+  // does not execute code.
   ompt_thread_info_t oldInfo;
   kmp_info_t *thread;
   if (UNLIKELY(ompt_enabled.enabled)) {
@@ -1323,6 +1319,16 @@
   }
 #endif
 
+#if OMP_45_ENABLED
+  // Proxy tasks are not handled by the runtime
+  if (taskdata->td_flags.proxy != TASK_PROXY) {
+#endif
+    ANNOTATE_HAPPENS_AFTER(task);
+    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
+#if OMP_45_ENABLED
+  }
+#endif
+
 #if OMP_40_ENABLED
   // TODO: cancel tasks if the parallel region has also been cancelled
   // TODO: check if this sequence can be hoisted above __kmp_task_start
@@ -1399,27 +1405,26 @@
     }
     KMP_POP_PARTITIONED_TIMER();
 
-#if OMPT_SUPPORT
-    if (UNLIKELY(ompt_enabled.enabled))
-      __ompt_task_finish(task, current_task);
-#endif
 #if OMP_40_ENABLED
   }
 #endif // OMP_40_ENABLED
 
-#if OMPT_SUPPORT
-  if (UNLIKELY(ompt_enabled.enabled)) {
-    thread->th.ompt_thread_info = oldInfo;
-    taskdata->ompt_task_info.frame.exit_frame = NULL;
-  }
-#endif
 
 #if OMP_45_ENABLED
   // Proxy tasks are not handled by the runtime
   if (taskdata->td_flags.proxy != TASK_PROXY) {
 #endif
     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
-    __kmp_task_finish(gtid, task, current_task); // OMPT only if not discarded
+#if OMPT_SUPPORT
+    if (UNLIKELY(ompt_enabled.enabled)) {
+      thread->th.ompt_thread_info = oldInfo;
+      if (taskdata->td_flags.tiedness == TASK_TIED) {
+        taskdata->ompt_task_info.frame.exit_frame = NULL;
+      }
+      __kmp_task_finish<true>(gtid, task, current_task);
+    } else
+#endif
+      __kmp_task_finish<false>(gtid, task, current_task);
 #if OMP_45_ENABLED
   }
 #endif
@@ -1556,8 +1561,73 @@
 
 #if OMPT_SUPPORT
   kmp_taskdata_t *parent = NULL;
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    if (!new_taskdata->td_flags.started) {
+      OMPT_STORE_RETURN_ADDRESS(gtid);
+      parent = new_taskdata->td_parent;
+      if (!parent->ompt_task_info.frame.enter_frame) {
+        parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
+      }
+      if (ompt_enabled.ompt_callback_task_create) {
+        ompt_data_t task_data = ompt_data_none;
+        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+            parent ? &(parent->ompt_task_info.task_data) : &task_data,
+            parent ? &(parent->ompt_task_info.frame) : NULL,
+            &(new_taskdata->ompt_task_info.task_data),
+            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+            OMPT_LOAD_RETURN_ADDRESS(gtid));
+      }
+    } else {
+      // We are scheduling the continuation of an UNTIED task.
+      // Scheduling back to the parent task.
+      __ompt_task_finish(new_task,
+                         new_taskdata->ompt_task_info.scheduling_parent,
+                         ompt_task_others);
+      new_taskdata->ompt_task_info.frame.exit_frame = NULL;
+    }
+  }
+#endif
+
+  res = __kmp_omp_task(gtid, new_task, true);
+
+  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
+                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
+                gtid, loc_ref, new_taskdata));
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
+    parent->ompt_task_info.frame.enter_frame = NULL;
+  }
+#endif
+  return res;
+}
+
+// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
+// a taskloop task with the correct OMPT return address
+//
+// loc_ref: location of original task pragma (ignored)
+// gtid: Global Thread ID of encountering thread
+// new_task: non-thread-switchable task thunk allocated by
+// __kmp_omp_task_alloc()
+// codeptr_ra: return address for OMPT callback
+// Returns:
+//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
+//    be resumed later.
+//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
+//    resumed later.
+kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_task_t *new_task, void *codeptr_ra) {
+  kmp_int32 res;
+  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
+
+#if KMP_DEBUG || OMPT_SUPPORT
+  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#endif
+  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
+                new_taskdata));
+
+#if OMPT_SUPPORT
+  kmp_taskdata_t *parent = NULL;
   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
-    OMPT_STORE_RETURN_ADDRESS(gtid);
     parent = new_taskdata->td_parent;
     if (!parent->ompt_task_info.frame.enter_frame)
       parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
@@ -1568,7 +1638,7 @@
           parent ? &(parent->ompt_task_info.frame) : NULL,
           &(new_taskdata->ompt_task_info.task_data),
           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
-          OMPT_LOAD_RETURN_ADDRESS(gtid));
+          codeptr_ra);
     }
   }
 #endif
@@ -1648,10 +1718,10 @@
                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
 #endif
     if (must_wait) {
-      kmp_flag_32 flag(
-          RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks),
-          0U);
-      while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
+                             &(taskdata->td_incomplete_child_tasks)),
+                       0U);
+      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
         flag.execute_tasks(thread, gtid, FALSE,
                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                            __kmp_task_stealing_constraint);
@@ -1982,8 +2052,8 @@
   kmp_taskgroup_t *tg_new =
       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
-  tg_new->count = 0;
-  tg_new->cancel_request = cancel_noreq;
+  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
+  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
   tg_new->parent = taskdata->td_taskgroup;
 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
 #if OMP_45_ENABLED
@@ -2066,8 +2136,9 @@
     if (!taskdata->td_flags.team_serial)
 #endif
     {
-      kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U);
-      while (TCR_4(taskgroup->count) != 0) {
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
+                       0U);
+      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
         flag.execute_tasks(thread, gtid, FALSE,
                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                            __kmp_task_stealing_constraint);
@@ -2205,7 +2276,7 @@
 // task_team thread_data before calling this routine.
 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                     kmp_task_team_t *task_team,
-                                    volatile kmp_int32 *unfinished_threads,
+                                    std::atomic<kmp_int32> *unfinished_threads,
                                     int *thread_finished,
                                     kmp_int32 is_constrained) {
   kmp_task_t *task;
@@ -2333,8 +2404,9 @@
       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
       prev = target;
     }
-    KMP_DEBUG_ASSERT(victim_td->td.td_deque_tail ==
-                     ((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
+    KMP_DEBUG_ASSERT(
+        victim_td->td.td_deque_tail ==
+        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
     victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
   }
   if (*thread_finished) {
@@ -2343,7 +2415,7 @@
     // master victim) might be prematurely released from the barrier!!!
     kmp_int32 count;
 
-    count = KMP_TEST_THEN_INC32(unfinished_threads);
+    count = KMP_ATOMIC_INC(unfinished_threads);
 
     KA_TRACE(
         20,
@@ -2386,7 +2458,7 @@
   kmp_task_t *task;
   kmp_info_t *other_thread;
   kmp_taskdata_t *current_task = thread->th.th_current_task;
-  volatile kmp_int32 *unfinished_threads;
+  std::atomic<kmp_int32> *unfinished_threads;
   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
                       tid = thread->th.th_info.ds.ds_tid;
 
@@ -2411,7 +2483,7 @@
 #else
   KMP_DEBUG_ASSERT(nthreads > 1);
 #endif
-  KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);
+  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
 
   while (1) { // Outer loop keeps trying to find tasks in case of single thread
     // getting tasks from target constructs
@@ -2541,7 +2613,8 @@
 #if OMP_45_ENABLED
     // The work queue may be empty but there might be proxy tasks still
     // executing
-    if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
+    if (final_spin &&
+        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
 #else
     if (final_spin)
 #endif
@@ -2552,7 +2625,7 @@
       if (!*thread_finished) {
         kmp_int32 count;
 
-        count = KMP_TEST_THEN_DEC32(unfinished_threads) - 1;
+        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                       "unfinished_threads to %d task_team=%p\n",
                       gtid, count, task_team));
@@ -2979,13 +3052,13 @@
 #endif
   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
 
-  TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
+  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
   TCW_4(task_team->tt.tt_active, TRUE);
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                 "unfinished_threads init'd to %d\n",
                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
-                task_team->tt.tt_unfinished_threads));
+                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
   return task_team;
 }
 
@@ -3143,7 +3216,8 @@
 #if OMP_45_ENABLED
         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
 #endif
-        TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
+        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
+                          team->t.t_nproc);
         TCW_4(task_team->tt.tt_active, TRUE);
       }
       // if team size has changed, the first thread to enable tasking will
@@ -3200,9 +3274,9 @@
       // Worker threads may have dropped through to release phase, but could
       // still be executing tasks. Wait here for tasks to complete. To avoid
       // memory contention, only master thread checks termination condition.
-      kmp_flag_32 flag(
-          RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads),
-          0U);
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
+                             &task_team->tt.tt_unfinished_threads),
+                       0U);
       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
     }
     // Deactivate the old task team, so that the worker threads will stop
@@ -3233,21 +3307,21 @@
 // barrier. It is a full barrier itself, which unfortunately turns regular
 // barriers into double barriers and join barriers into 1 1/2 barriers.
 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
-  volatile kmp_uint32 *spin = RCAST(
-      volatile kmp_uint32 *,
+  std::atomic<kmp_uint32> *spin = RCAST(
+      std::atomic<kmp_uint32> *,
       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
   int flag = FALSE;
   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
 
 #if USE_ITT_BUILD
-  KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
+  KMP_FSYNC_SPIN_INIT(spin, NULL);
 #endif /* USE_ITT_BUILD */
   kmp_flag_32 spin_flag(spin, 0U);
   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
 #if USE_ITT_BUILD
     // TODO: What about itt_sync_obj??
-    KMP_FSYNC_SPIN_PREPARE(CCAST(kmp_uint32 *, spin));
+    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
 #endif /* USE_ITT_BUILD */
 
     if (TCR_4(__kmp_global.g.g_done)) {
@@ -3258,7 +3332,7 @@
     KMP_YIELD(TRUE); // GH: We always yield here
   }
 #if USE_ITT_BUILD
-  KMP_FSYNC_SPIN_ACQUIRED(CCAST(kmp_uint32 *, spin));
+  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
 #endif /* USE_ITT_BUILD */
 }
 
@@ -3370,11 +3444,11 @@
   taskdata->td_flags.complete = 1; // mark the task as completed
 
   if (taskdata->td_taskgroup)
-    KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count);
+    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
 
   // Create an imaginary children for this task so the bottom half cannot
   // release the task before we have completed the second top half
-  TCI_4(taskdata->td_incomplete_child_tasks);
+  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
 }
 
 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
@@ -3382,11 +3456,11 @@
 
   // Predecrement simulated by "- 1" calculation
   children =
-      KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
   KMP_DEBUG_ASSERT(children >= 0);
 
   // Remove the imaginary children
-  TCD_4(taskdata->td_incomplete_child_tasks);
+  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
 }
 
 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
@@ -3399,7 +3473,7 @@
 
   // We need to wait to make sure the top half is finished
   // Spinning here should be ok as this should happen quickly
-  while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
+  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
     ;
 
   __kmp_release_deps(gtid, taskdata);
@@ -3534,13 +3608,13 @@
   // Only need to keep track of child task counts if team parallel and tasking
   // not serialized
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
-    KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
+    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
     if (parent_task->td_taskgroup)
-      KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count);
+      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
     // Only need to keep track of allocated child tasks for explicit tasks since
     // implicit not deallocated
     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
-      KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
+      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
   }
 
   KA_TRACE(20,
@@ -3667,23 +3741,28 @@
 
 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
-// loc       Source location information
-// gtid      Global thread ID
-// task      Pattern task, exposes the loop iteration range
-// lb        Pointer to loop lower bound in task structure
-// ub        Pointer to loop upper bound in task structure
-// st        Loop stride
-// ub_glob   Global upper bound (used for lastprivate check)
-// num_tasks Number of tasks to execute
-// grainsize Number of loop iterations per task
-// extras    Number of chunks with grainsize+1 iterations
-// tc        Iterations count
-// task_dup  Tasks duplication routine
+// loc        Source location information
+// gtid       Global thread ID
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
+// st         Loop stride
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
+// task_dup   Tasks duplication routine
+// codeptr_ra Return address for OMPT events
 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                            kmp_uint64 grainsize, kmp_uint64 extras,
-                           kmp_uint64 tc, void *task_dup) {
+                           kmp_uint64 tc,
+#if OMPT_SUPPORT
+                           void *codeptr_ra,
+#endif
+                           void *task_dup) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
@@ -3751,13 +3830,18 @@
               gtid, i, next_task, lower, upper, st,
               next_task_bounds.get_lower_offset(),
               next_task_bounds.get_upper_offset()));
+#if OMPT_SUPPORT
+    __kmp_omp_taskloop_task(NULL, gtid, next_task,
+                           codeptr_ra); // schedule new task
+#else
     __kmp_omp_task(gtid, next_task, true); // schedule new task
+#endif
     lower = upper + st; // adjust lower bound for the next iteration
   }
   // free the pattern task and exit
   __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
   // do not execute the pattern task, just do internal bookkeeping
-  __kmp_task_finish(gtid, task, current_task);
+  __kmp_task_finish<false>(gtid, task, current_task);
 }
 
 // Structure to keep taskloop parameters for auxiliary task
@@ -3774,11 +3858,17 @@
   kmp_uint64 extras;
   kmp_uint64 tc;
   kmp_uint64 num_t_min;
+#if OMPT_SUPPORT
+  void *codeptr_ra;
+#endif
 } __taskloop_params_t;
 
 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                           kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+#if OMPT_SUPPORT
+                          void *,
+#endif
                           void *);
 
 // Execute part of the the taskloop submitted as a task.
@@ -3797,6 +3887,9 @@
   kmp_uint64 extras = p->extras;
   kmp_uint64 tc = p->tc;
   kmp_uint64 num_t_min = p->num_t_min;
+#if OMPT_SUPPORT
+  void *codeptr_ra = p->codeptr_ra;
+#endif
 #if KMP_DEBUG
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
@@ -3808,10 +3901,18 @@
   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
   if (num_tasks > num_t_min)
     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                         grainsize, extras, tc, num_t_min, task_dup);
+                         grainsize, extras, tc, num_t_min,
+#if OMPT_SUPPORT
+                         codeptr_ra,
+#endif
+                         task_dup);
   else
     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, tc, task_dup);
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          codeptr_ra,
+#endif
+                          task_dup);
 
   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
   return 0;
@@ -3820,24 +3921,29 @@
 // Schedule part of the the taskloop as a task,
 // execute the rest of the the taskloop.
 //
-// loc       Source location information
-// gtid      Global thread ID
-// task      Pattern task, exposes the loop iteration range
-// lb        Pointer to loop lower bound in task structure
-// ub        Pointer to loop upper bound in task structure
-// st        Loop stride
-// ub_glob   Global upper bound (used for lastprivate check)
-// num_tasks Number of tasks to execute
-// grainsize Number of loop iterations per task
-// extras    Number of chunks with grainsize+1 iterations
-// tc        Iterations count
-// num_t_min Threashold to launch tasks recursively
-// task_dup  Tasks duplication routine
+// loc        Source location information
+// gtid       Global thread ID
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
+// st         Loop stride
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
+// num_t_min  Threashold to launch tasks recursively
+// task_dup   Tasks duplication routine
+// codeptr_ra Return address for OMPT events
 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
-                          kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
+                          kmp_uint64 tc, kmp_uint64 num_t_min,
+#if OMPT_SUPPORT
+                          void *codeptr_ra,
+#endif
+                          void *task_dup) {
 #if KMP_DEBUG
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
@@ -3849,11 +3955,9 @@
 #endif
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
   kmp_uint64 lower = *lb;
-  kmp_uint64 upper = *ub;
   kmp_info_t *thread = __kmp_threads[gtid];
   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
-  kmp_int32 lastpriv = 0;
   size_t lower_offset =
       (char *)lb - (char *)task; // remember offset of lb in the task structure
   size_t upper_offset =
@@ -3907,15 +4011,32 @@
   p->extras = ext1;
   p->tc = tc1;
   p->num_t_min = num_t_min;
+#if OMPT_SUPPORT
+  p->codeptr_ra = codeptr_ra;
+#endif
+
+#if OMPT_SUPPORT
+  // schedule new task with correct return address for OMPT events
+  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
+#else
   __kmp_omp_task(gtid, new_task, true); // schedule new task
+#endif
 
   // execute the 1st half of current subrange
   if (n_tsk0 > num_t_min)
     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
-                         ext0, tc0, num_t_min, task_dup);
+                         ext0, tc0, num_t_min,
+#if OMPT_SUPPORT
+                         codeptr_ra,
+#endif
+                         task_dup);
   else
     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
-                          gr_size0, ext0, tc0, task_dup);
+                          gr_size0, ext0, tc0,
+#if OMPT_SUPPORT
+                          codeptr_ra,
+#endif
+                          task_dup);
 
   KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
 }
@@ -3942,16 +4063,6 @@
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
 
-#if OMPT_SUPPORT && OMPT_OPTIONAL
-  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
-  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
-  if (ompt_enabled.ompt_callback_work) {
-    ompt_callbacks.ompt_callback(ompt_callback_work)(
-        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
-        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
-  }
-#endif
-
   if (nogroup == 0) {
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     OMPT_STORE_RETURN_ADDRESS(gtid);
@@ -3989,9 +4100,20 @@
     // free the pattern task and exit
     __kmp_task_start(gtid, task, current_task);
     // do not execute anything for zero-trip loop
-    __kmp_task_finish(gtid, task, current_task);
+    __kmp_task_finish<false>(gtid, task, current_task);
     return;
   }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
   if (num_tasks_min == 0)
     // TODO: can we choose better default heuristic?
     num_tasks_min =
@@ -4038,47 +4160,51 @@
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
-#if OMPT_SUPPORT && OMPT_OPTIONAL
-    OMPT_STORE_RETURN_ADDRESS(gtid);
-#endif
     // always start serial tasks linearly
     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, tc, task_dup);
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                          task_dup);
     // !taskdata->td_flags.native => currently force linear spawning of tasks
     // for GOMP_taskloop
   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                   "(%lld), grain %llu, extras %llu\n",
                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
-#if OMPT_SUPPORT && OMPT_OPTIONAL
-    OMPT_STORE_RETURN_ADDRESS(gtid);
-#endif
     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                         grainsize, extras, tc, num_tasks_min, task_dup);
+                         grainsize, extras, tc, num_tasks_min,
+#if OMPT_SUPPORT
+                         OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                         task_dup);
   } else {
     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                   "(%lld), grain %llu, extras %llu\n",
                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
-#if OMPT_SUPPORT && OMPT_OPTIONAL
-    OMPT_STORE_RETURN_ADDRESS(gtid);
-#endif
     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, tc, task_dup);
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                          task_dup);
   }
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
+        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
   if (nogroup == 0) {
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
     __kmpc_end_taskgroup(loc, gtid);
   }
-#if OMPT_SUPPORT && OMPT_OPTIONAL
-  if (ompt_enabled.ompt_callback_work) {
-    ompt_callbacks.ompt_callback(ompt_callback_work)(
-        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
-        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
-  }
-#endif
   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
 
diff --git a/runtime/src/kmp_threadprivate.cpp b/runtime/src/kmp_threadprivate.cpp
index e64097b..d1ca422 100644
--- a/runtime/src/kmp_threadprivate.cpp
+++ b/runtime/src/kmp_threadprivate.cpp
@@ -719,7 +719,7 @@
       // compilers use new method.)
       (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache,
                                       my_cache);
-      //TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
+      // TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
 
       // If the store doesn't happen here, the compiler's old behavior will
       // inevitably call __kmpc_threadprivate_cache with a new location for the
diff --git a/runtime/src/kmp_wait_release.cpp b/runtime/src/kmp_wait_release.cpp
index 98d3a1f..3ebec6b 100644
--- a/runtime/src/kmp_wait_release.cpp
+++ b/runtime/src/kmp_wait_release.cpp
@@ -15,8 +15,12 @@
 
 void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag,
                    int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
-  __kmp_wait_template(this_thr, flag,
-                      final_spin USE_ITT_BUILD_ARG(itt_sync_obj));
+  if (final_spin)
+    __kmp_wait_template<kmp_flag_64, TRUE>(
+        this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj));
+  else
+    __kmp_wait_template<kmp_flag_64, FALSE>(
+        this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj));
 }
 
 void __kmp_release_64(kmp_flag_64 *flag) { __kmp_release_template(flag); }
diff --git a/runtime/src/kmp_wait_release.h b/runtime/src/kmp_wait_release.h
index 1f519d5..71c334d 100644
--- a/runtime/src/kmp_wait_release.h
+++ b/runtime/src/kmp_wait_release.h
@@ -46,26 +46,56 @@
 /*!
  * Base class for wait/release volatile flag
  */
+template <typename P> class kmp_flag_native {
+  volatile P *loc;
+  flag_type t;
+
+public:
+  typedef P flag_t;
+  kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {}
+  volatile P *get() { return loc; }
+  void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
+  void set(volatile P *new_loc) { loc = new_loc; }
+  flag_type get_type() { return t; }
+  P load() { return *loc; }
+  void store(P val) { *loc = val; }
+};
+
+/*!
+ * Base class for wait/release atomic flag
+ */
 template <typename P> class kmp_flag {
-  volatile P
+  std::atomic<P>
       *loc; /**< Pointer to the flag storage that is modified by another thread
              */
   flag_type t; /**< "Type" of the flag in loc */
 public:
   typedef P flag_t;
-  kmp_flag(volatile P *p, flag_type ft) : loc(p), t(ft) {}
+  kmp_flag(std::atomic<P> *p, flag_type ft) : loc(p), t(ft) {}
   /*!
    * @result the pointer to the actual flag
    */
-  volatile P *get() { return loc; }
+  std::atomic<P> *get() { return loc; }
+  /*!
+   * @result void* pointer to the actual flag
+   */
+  void *get_void_p() { return RCAST(void *, loc); }
   /*!
    * @param new_loc in   set loc to point at new_loc
    */
-  void set(volatile P *new_loc) { loc = new_loc; }
+  void set(std::atomic<P> *new_loc) { loc = new_loc; }
   /*!
    * @result the flag_type
    */
   flag_type get_type() { return t; }
+  /*!
+   * @result flag value
+   */
+  P load() { return loc->load(std::memory_order_acquire); }
+  /*!
+   * @param val the new flag value to be stored
+   */
+  void store(P val) { loc->store(val, std::memory_order_release); }
   // Derived classes must provide the following:
   /*
   kmp_info_t * get_waiter(kmp_uint32 i);
@@ -128,29 +158,37 @@
 
 /* Spin wait loop that first does pause, then yield, then sleep. A thread that
    calls __kmp_wait_*  must make certain that another thread calls __kmp_release
-   to wake it back up to prevent deadlocks!  */
-template <class C>
+   to wake it back up to prevent deadlocks!
+
+   NOTE: We may not belong to a team at this point.  */
+template <class C, int final_spin>
 static inline void
-__kmp_wait_template(kmp_info_t *this_thr, C *flag,
-                    int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
-  // NOTE: We may not belong to a team at this point.
-  volatile typename C::flag_t *spin = flag->get();
+__kmp_wait_template(kmp_info_t *this_thr,
+                    C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  volatile void *spin = flag->get();
+#endif
   kmp_uint32 spins;
-  kmp_uint32 hibernate;
   int th_gtid;
   int tasks_completed = FALSE;
   int oversubscribed;
 #if !KMP_USE_MONITOR
   kmp_uint64 poll_count;
   kmp_uint64 hibernate_goal;
+#else
+  kmp_uint32 hibernate;
 #endif
 
   KMP_FSYNC_SPIN_INIT(spin, NULL);
   if (flag->done_check()) {
-    KMP_FSYNC_SPIN_ACQUIRED(CCAST(typename C::flag_t *, spin));
+    KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
     return;
   }
   th_gtid = this_thr->th.th_info.ds.ds_gtid;
+#if KMP_OS_UNIX
+  if (final_spin)
+    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
+#endif
   KA_TRACE(20,
            ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
 #if KMP_STATS_ENABLED
@@ -318,7 +356,7 @@
       } // if
     } // if
 
-    KMP_FSYNC_SPIN_PREPARE(CCAST(typename C::flag_t *, spin));
+    KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
     if (TCR_4(__kmp_global.g.g_done)) {
       if (__kmp_global.g.g_abort)
         __kmp_abort_thread();
@@ -340,7 +378,7 @@
     in_pool = !!TCR_4(this_thr->th.th_in_pool);
     if (in_pool != !!this_thr->th.th_active_in_pool) {
       if (in_pool) { // Recently transferred from team to pool
-        KMP_TEST_THEN_INC32(&__kmp_thread_pool_active_nth);
+        KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
         this_thr->th.th_active_in_pool = TRUE;
         /* Here, we cannot assert that:
            KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) <=
@@ -350,7 +388,7 @@
            inc/dec'd asynchronously by the workers. The two can get out of sync
            for brief periods of time.  */
       } else { // Recently transferred from pool to team
-        KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
+        KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
         KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
         this_thr->th.th_active_in_pool = FALSE;
       }
@@ -384,7 +422,15 @@
 #endif
 
     KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
+#if KMP_OS_UNIX
+    if (final_spin)
+      KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
+#endif
     flag->suspend(th_gtid);
+#if KMP_OS_UNIX
+    if (final_spin)
+      KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
+#endif
 
     if (TCR_4(__kmp_global.g.g_done)) {
       if (__kmp_global.g.g_abort)
@@ -425,7 +471,11 @@
   }
 #endif
 
-  KMP_FSYNC_SPIN_ACQUIRED(CCAST(typename C::flag_t *, spin));
+#if KMP_OS_UNIX
+  if (final_spin)
+    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
+#endif
+  KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
 }
 
 /* Release any threads specified as waiting on the flag by releasing the flag
@@ -438,12 +488,12 @@
 #endif
   KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get()));
   KMP_DEBUG_ASSERT(flag->get());
-  KMP_FSYNC_RELEASING(CCAST(typename C::flag_t *, flag->get()));
+  KMP_FSYNC_RELEASING(flag->get_void_p());
 
   flag->internal_release();
 
   KF_TRACE(100, ("__kmp_release: T#%d set new spin=%d\n", gtid, flag->get(),
-                 *(flag->get())));
+                 flag->load()));
 
   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
     // Only need to check sleep stuff if infinite block time not set.
@@ -497,7 +547,9 @@
   }
 };
 
-template <typename FlagType> class kmp_basic_flag : public kmp_flag<FlagType> {
+// Basic flag that does not use C11 Atomics
+template <typename FlagType>
+class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
   typedef flag_traits<FlagType> traits_type;
   FlagType checker; /**< Value to compare flag to to check if flag has been
                        released. */
@@ -506,14 +558,14 @@
   kmp_uint32
       num_waiting_threads; /**< Number of threads sleeping on this thread. */
 public:
-  kmp_basic_flag(volatile FlagType *p)
-      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
-  kmp_basic_flag(volatile FlagType *p, kmp_info_t *thr)
-      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
+  kmp_basic_flag_native(volatile FlagType *p)
+      : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
+  kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
+      : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
     waiting_threads[0] = thr;
   }
-  kmp_basic_flag(volatile FlagType *p, FlagType c)
-      : kmp_flag<FlagType>(p, traits_type::t), checker(c),
+  kmp_basic_flag_native(volatile FlagType *p, FlagType c)
+      : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
         num_waiting_threads(0) {}
   /*!
    * param i in   index into waiting_threads
@@ -594,12 +646,105 @@
   enum barrier_type get_bt() { return bs_last_barrier; }
 };
 
+template <typename FlagType> class kmp_basic_flag : public kmp_flag<FlagType> {
+  typedef flag_traits<FlagType> traits_type;
+  FlagType checker; /**< Value to compare flag to to check if flag has been
+                       released. */
+  kmp_info_t
+      *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
+  kmp_uint32
+      num_waiting_threads; /**< Number of threads sleeping on this thread. */
+public:
+  kmp_basic_flag(std::atomic<FlagType> *p)
+      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
+  kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
+      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
+    waiting_threads[0] = thr;
+  }
+  kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
+      : kmp_flag<FlagType>(p, traits_type::t), checker(c),
+        num_waiting_threads(0) {}
+  /*!
+   * param i in   index into waiting_threads
+   * @result the thread that is waiting at index i
+   */
+  kmp_info_t *get_waiter(kmp_uint32 i) {
+    KMP_DEBUG_ASSERT(i < num_waiting_threads);
+    return waiting_threads[i];
+  }
+  /*!
+   * @result num_waiting_threads
+   */
+  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+  /*!
+   * @param thr in   the thread which is now waiting
+   *
+   * Insert a waiting thread at index 0.
+   */
+  void set_waiter(kmp_info_t *thr) {
+    waiting_threads[0] = thr;
+    num_waiting_threads = 1;
+  }
+  /*!
+   * @result true if the flag object has been released.
+   */
+  bool done_check() { return this->load() == checker; }
+  /*!
+   * @param old_loc in   old value of flag
+   * @result true if the flag's old value indicates it was released.
+   */
+  bool done_check_val(FlagType old_loc) { return old_loc == checker; }
+  /*!
+   * @result true if the flag object is not yet released.
+   * Used in __kmp_wait_template like:
+   * @code
+   * while (flag.notdone_check()) { pause(); }
+   * @endcode
+   */
+  bool notdone_check() { return this->load() != checker; }
+  /*!
+   * @result Actual flag value before release was applied.
+   * Trigger all waiting threads to run by modifying flag to release state.
+   */
+  void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
+  /*!
+   * @result Actual flag value before sleep bit(s) set.
+   * Notes that there is at least one thread sleeping on the flag by setting
+   * sleep bit(s).
+   */
+  FlagType set_sleeping() {
+    return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @result Actual flag value before sleep bit(s) cleared.
+   * Notes that there are no longer threads sleeping on the flag by clearing
+   * sleep bit(s).
+   */
+  FlagType unset_sleeping() {
+    return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @param old_loc in   old value of flag
+   * Test whether there are threads sleeping on the flag's old value in old_loc.
+   */
+  bool is_sleeping_val(FlagType old_loc) {
+    return old_loc & KMP_BARRIER_SLEEP_STATE;
+  }
+  /*!
+   * Test whether there are threads sleeping on the flag.
+   */
+  bool is_sleeping() { return is_sleeping_val(this->load()); }
+  bool is_any_sleeping() { return is_sleeping_val(this->load()); }
+  kmp_uint8 *get_stolen() { return NULL; }
+  enum barrier_type get_bt() { return bs_last_barrier; }
+};
+
 class kmp_flag_32 : public kmp_basic_flag<kmp_uint32> {
 public:
-  kmp_flag_32(volatile kmp_uint32 *p) : kmp_basic_flag<kmp_uint32>(p) {}
-  kmp_flag_32(volatile kmp_uint32 *p, kmp_info_t *thr)
+  kmp_flag_32(std::atomic<kmp_uint32> *p) : kmp_basic_flag<kmp_uint32>(p) {}
+  kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
       : kmp_basic_flag<kmp_uint32>(p, thr) {}
-  kmp_flag_32(volatile kmp_uint32 *p, kmp_uint32 c)
+  kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
       : kmp_basic_flag<kmp_uint32>(p, c) {}
   void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
   void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
@@ -612,20 +757,24 @@
   }
   void wait(kmp_info_t *this_thr,
             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
-    __kmp_wait_template(this_thr, this,
-                        final_spin USE_ITT_BUILD_ARG(itt_sync_obj));
+    if (final_spin)
+      __kmp_wait_template<kmp_flag_32, TRUE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+    else
+      __kmp_wait_template<kmp_flag_32, FALSE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
   }
   void release() { __kmp_release_template(this); }
   flag_type get_ptr_type() { return flag32; }
 };
 
-class kmp_flag_64 : public kmp_basic_flag<kmp_uint64> {
+class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64> {
 public:
-  kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag<kmp_uint64>(p) {}
+  kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native<kmp_uint64>(p) {}
   kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
-      : kmp_basic_flag<kmp_uint64>(p, thr) {}
+      : kmp_basic_flag_native<kmp_uint64>(p, thr) {}
   kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
-      : kmp_basic_flag<kmp_uint64>(p, c) {}
+      : kmp_basic_flag_native<kmp_uint64>(p, c) {}
   void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
   void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
@@ -637,15 +786,19 @@
   }
   void wait(kmp_info_t *this_thr,
             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
-    __kmp_wait_template(this_thr, this,
-                        final_spin USE_ITT_BUILD_ARG(itt_sync_obj));
+    if (final_spin)
+      __kmp_wait_template<kmp_flag_64, TRUE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+    else
+      __kmp_wait_template<kmp_flag_64, FALSE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
   }
   void release() { __kmp_release_template(this); }
   flag_type get_ptr_type() { return flag64; }
 };
 
 // Hierarchical 64-bit on-core barrier instantiation
-class kmp_flag_oncore : public kmp_flag<kmp_uint64> {
+class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
   kmp_uint64 checker;
   kmp_info_t *waiting_threads[1];
   kmp_uint32 num_waiting_threads;
@@ -665,27 +818,17 @@
 
 public:
   kmp_flag_oncore(volatile kmp_uint64 *p)
-      : kmp_flag<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
         flag_switch(false) {}
   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
-      : kmp_flag<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
         offset(idx), flag_switch(false) {}
   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
-                  enum barrier_type bar_t, kmp_info_t *thr
-#if USE_ITT_BUILD
-                  ,
-                  void *itt
-#endif
-                  )
-      : kmp_flag<kmp_uint64>(p, flag_oncore), checker(c),
+                  enum barrier_type bar_t,
+                  kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
         num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
-        this_thr(thr)
-#if USE_ITT_BUILD
-        ,
-        itt_sync_obj(itt)
-#endif
-  {
-  }
+        this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
   kmp_info_t *get_waiter(kmp_uint32 i) {
     KMP_DEBUG_ASSERT(i < num_waiting_threads);
     return waiting_threads[i];
@@ -709,16 +852,12 @@
       this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
       kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go,
                        (kmp_uint64)KMP_BARRIER_STATE_BUMP);
-      __kmp_wait_64(this_thr, &flag, TRUE
-#if USE_ITT_BUILD
-                    ,
-                    itt_sync_obj
-#endif
-                    );
+      __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
     }
     return false;
   }
   void internal_release() {
+    // Other threads can write their own bytes simultaneously.
     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
       byteref(get(), offset) = 1;
     } else {
@@ -739,8 +878,12 @@
   bool is_sleeping() { return is_sleeping_val(*get()); }
   bool is_any_sleeping() { return is_sleeping_val(*get()); }
   void wait(kmp_info_t *this_thr, int final_spin) {
-    __kmp_wait_template<kmp_flag_oncore>(
-        this_thr, this, final_spin USE_ITT_BUILD_ARG(itt_sync_obj));
+    if (final_spin)
+      __kmp_wait_template<kmp_flag_oncore, TRUE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+    else
+      __kmp_wait_template<kmp_flag_oncore, FALSE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
   }
   void release() { __kmp_release_template(this); }
   void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
diff --git a/runtime/src/ompt-general.cpp b/runtime/src/ompt-general.cpp
index 1ef0820..25ad19d 100644
--- a/runtime/src/ompt-general.cpp
+++ b/runtime/src/ompt-general.cpp
@@ -643,7 +643,10 @@
  * compatability
  ****************************************************************************/
 
+/*
+ * Currently unused function
 OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; }
+*/
 
 /*****************************************************************************
 * application-facing API
diff --git a/runtime/src/ompt-specific.cpp b/runtime/src/ompt-specific.cpp
index ad65808..23d09aa 100644
--- a/runtime/src/ompt-specific.cpp
+++ b/runtime/src/ompt-specific.cpp
@@ -341,18 +341,23 @@
   ompt_task_info_t *info = NULL;
   ompt_team_info_t *team_info = NULL;
   kmp_info_t *thr = ompt_get_thread();
+  int level = ancestor_level;
 
   if (thr) {
     kmp_taskdata_t *taskdata = thr->th.th_current_task;
     if (taskdata == NULL)
       return 0;
-    kmp_team *team = thr->th.th_team;
+    kmp_team *team = thr->th.th_team, *prev_team = NULL;
     if (team == NULL)
       return 0;
     ompt_lw_taskteam_t *lwt = NULL,
-                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team),
+                       *prev_lwt = NULL;
 
     while (ancestor_level > 0) {
+      // needed for thread_num
+      prev_team = team;
+      prev_lwt = lwt;
       // next lightweight team (if any)
       if (lwt)
         lwt = lwt->parent;
@@ -410,7 +415,13 @@
       *parallel_data = team_info ? &(team_info->parallel_data) : NULL;
     }
     if (thread_num) {
-      *thread_num = __kmp_get_gtid();
+      if (level == 0)
+        *thread_num = __kmp_get_tid();
+      else if (prev_lwt)
+        *thread_num = 0;
+      else
+        *thread_num = prev_team->t.t_master_tid;
+      //        *thread_num = team->t.t_master_tid;
     }
     return info ? 2 : 0;
   }
diff --git a/runtime/src/ompt-specific.h b/runtime/src/ompt-specific.h
index 450f4c4..28905bb 100644
--- a/runtime/src/ompt-specific.h
+++ b/runtime/src/ompt-specific.h
@@ -52,7 +52,10 @@
 
 ompt_data_t *__ompt_get_thread_data_internal();
 
+/*
+ * Unused currently
 static uint64_t __ompt_get_get_unique_id_internal();
+*/
 
 /*****************************************************************************
  * macros
diff --git a/runtime/src/thirdparty/ittnotify/ittnotify_static.c b/runtime/src/thirdparty/ittnotify/ittnotify_static.c
index ffc87ca..16edbd1 100644
--- a/runtime/src/thirdparty/ittnotify/ittnotify_static.c
+++ b/runtime/src/thirdparty/ittnotify/ittnotify_static.c
@@ -94,8 +94,6 @@
         __itt_mutex_lock(&p.mutex);                                  \
 }
 
-const int _N_(err) = 0;
-
 typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id);
 
 /* this define used to control initialization function name. */
@@ -490,7 +488,7 @@
 
 static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen)
 {
-    namelen = namelen;
+    (void)namelen;
     ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name);
     return 0;
 }
@@ -538,14 +536,14 @@
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
 static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen)
 {
-    namelen = namelen;
+    (void)namelen;
     ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name);
     return 0;
 }
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen)
 {
-    namelen = namelen;
+    (void)namelen;
     ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name);
     return 0;
 }
diff --git a/runtime/src/z_Linux_util.cpp b/runtime/src/z_Linux_util.cpp
index 26c45a4..acb3452 100644
--- a/runtime/src/z_Linux_util.cpp
+++ b/runtime/src/z_Linux_util.cpp
@@ -510,7 +510,7 @@
 #if KMP_STATS_ENABLED
   // set thread local index to point to thread-specific stats
   __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats;
-  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+  __kmp_stats_thread_ptr->startLife();
   KMP_SET_THREAD_STATE(IDLE);
   KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
 #endif
@@ -1441,7 +1441,7 @@
 
   KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x,"
                " was %x\n",
-               th_gtid, flag->get(), *(flag->get()), old_spin));
+               th_gtid, flag->get(), flag->load(), old_spin));
 
   if (flag->done_check_val(old_spin)) {
     old_spin = flag->unset_sleeping();
@@ -1469,7 +1469,7 @@
         th->th.th_active = FALSE;
         if (th->th.th_active_in_pool) {
           th->th.th_active_in_pool = FALSE;
-          KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
+          KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
           KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
         }
         deactivated = TRUE;
@@ -1525,7 +1525,7 @@
     if (deactivated) {
       th->th.th_active = TRUE;
       if (TCR_4(th->th.th_in_pool)) {
-        KMP_TEST_THEN_INC32(&__kmp_thread_pool_active_nth);
+        KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
         th->th.th_active_in_pool = TRUE;
       }
     }
@@ -1598,7 +1598,7 @@
       KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
                    "awake: flag(%p): "
                    "%u => %u\n",
-                   gtid, target_gtid, flag->get(), old_spin, *flag->get()));
+                   gtid, target_gtid, flag->get(), old_spin, flag->load()));
       status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
       KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
       return;
@@ -1606,7 +1606,7 @@
     KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset "
                  "sleep bit for flag's loc(%p): "
                  "%u => %u\n",
-                 gtid, target_gtid, flag->get(), old_spin, *flag->get()));
+                 gtid, target_gtid, flag->get(), old_spin, flag->load()));
   }
   TCW_PTR(th->th.th_sleep_loc, NULL);
 
diff --git a/runtime/src/z_Windows_NT_util.cpp b/runtime/src/z_Windows_NT_util.cpp
index d8ffc7d..28ad36e 100644
--- a/runtime/src/z_Windows_NT_util.cpp
+++ b/runtime/src/z_Windows_NT_util.cpp
@@ -350,7 +350,7 @@
         th->th.th_active = FALSE;
         if (th->th.th_active_in_pool) {
           th->th.th_active_in_pool = FALSE;
-          KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth);
+          KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
           KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
         }
         deactivated = TRUE;
@@ -375,7 +375,7 @@
     if (deactivated) {
       th->th.th_active = TRUE;
       if (TCR_4(th->th.th_in_pool)) {
-        KMP_TEST_THEN_INC32((kmp_int32 *)&__kmp_thread_pool_active_nth);
+        KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
         th->th.th_active_in_pool = TRUE;
       }
     }
diff --git a/runtime/test/lit.cfg b/runtime/test/lit.cfg
index 2296987..e4561eb 100644
--- a/runtime/test/lit.cfg
+++ b/runtime/test/lit.cfg
@@ -120,7 +120,11 @@
     if config.operating_system == 'Windows':
         # No such environment variable on Windows.
         config.substitutions.append(("%preload-tool", "true ||"))
+        config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed"))
     elif config.operating_system == 'Darwin':
         config.substitutions.append(("%preload-tool", "env DYLD_INSERT_LIBRARIES=%T/tool.so"))
+        # No such linker flag on Darwin.
+        config.substitutions.append(("%no-as-needed-flag", ""))
     else:
         config.substitutions.append(("%preload-tool", "env LD_PRELOAD=%T/tool.so"))
+        config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed"))
diff --git a/runtime/test/ompt/callback.h b/runtime/test/ompt/callback.h
index af871b3..f1191ad 100755
--- a/runtime/test/ompt/callback.h
+++ b/runtime/test/ompt/callback.h
@@ -1,6 +1,11 @@
+#ifndef _BSD_SOURCE
 #define _BSD_SOURCE
+#endif
 #define _DEFAULT_SOURCE
 #include <stdio.h>
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
 #include <inttypes.h>
 #include <omp.h>
 #include <ompt.h>
@@ -178,11 +183,14 @@
 // to include or discard the new number of nibbles!
 #define FUZZY_ADDRESS_DISCARD_NIBBLES 2
 #define FUZZY_ADDRESS_DISCARD_BYTES (1 << ((FUZZY_ADDRESS_DISCARD_NIBBLES) * 4))
-#define print_fuzzy_address_blocks(addr) \
-  printf("%" PRIu64 ": fuzzy_address=0x%" PRIx64 " or 0x%" PRIx64 " (%p)\n", \
-  ompt_get_thread_data()->value, \
-  ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES - 1, \
-  ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES, addr)
+#define print_fuzzy_address_blocks(addr)                                       \
+  printf("%" PRIu64 ": fuzzy_address=0x%" PRIx64 " or 0x%" PRIx64              \
+         " or 0x%" PRIx64 " or 0x%" PRIx64 " (%p)\n",                          \
+         ompt_get_thread_data()->value,                                        \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES - 1,                   \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES,                       \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 1,                   \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 2, addr)
 
 static void
 on_ompt_callback_mutex_acquire(
diff --git a/runtime/test/ompt/loadtool/tool_available/tool_available.c b/runtime/test/ompt/loadtool/tool_available/tool_available.c
index dc00b03..fbbdadd 100644
--- a/runtime/test/ompt/loadtool/tool_available/tool_available.c
+++ b/runtime/test/ompt/loadtool/tool_available/tool_available.c
@@ -7,9 +7,9 @@
 // RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so
 // 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space"
 // 2.1 Link with tool during compilation
-// RUN: %libomp-compile -DCODE -Wl,--no-as-needed %T/tool.so && %libomp-run | FileCheck %s
+// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
 // 2.2 Link with tool during compilation, but AFTER the runtime
-// RUN: %libomp-compile -DCODE -lomp -Wl,--no-as-needed %T/tool.so && %libomp-run | FileCheck %s
+// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
 // 2.3 Inject tool via the dynamic loader
 // RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s
 
diff --git a/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c b/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c
index e25e737..b0d3f2b 100644
--- a/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c
+++ b/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c
@@ -7,9 +7,9 @@
 // RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so
 // 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space"
 // 2.1 Link with tool during compilation
-// RUN: %libomp-compile -DCODE -Wl,--no-as-needed %T/tool.so && %libomp-run | FileCheck %s
+// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
 // 2.2 Link with tool during compilation, but AFTER the runtime
-// RUN: %libomp-compile -DCODE -lomp -Wl,--no-as-needed %T/tool.so && %libomp-run | FileCheck %s
+// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
 // 2.3 Inject tool via the dynamic loader
 // RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s
 
diff --git a/runtime/test/ompt/misc/api_calls_from_other_thread.cpp b/runtime/test/ompt/misc/api_calls_from_other_thread.cpp
index f14759f..470d7cd 100644
--- a/runtime/test/ompt/misc/api_calls_from_other_thread.cpp
+++ b/runtime/test/ompt/misc/api_calls_from_other_thread.cpp
@@ -5,26 +5,54 @@
 #include "callback.h"
 
 void f() {
-    ompt_data_t* tdata = ompt_get_thread_data();
-    uint64_t tvalue = tdata ? tdata->value : 0;
+  ompt_data_t *tdata = ompt_get_thread_data();
+  uint64_t tvalue = tdata ? tdata->value : 0;
 
-    printf("%" PRIu64 ": ompt_get_num_places()=%d\n", tvalue, ompt_get_num_places());
+  printf("%" PRIu64 ": ompt_get_num_places()=%d\n", tvalue,
+         ompt_get_num_places());
 
-    printf("%" PRIu64 ": ompt_get_place_proc_ids()=%d\n", tvalue, ompt_get_place_proc_ids(0, 0, NULL));
+  printf("%" PRIu64 ": ompt_get_place_proc_ids()=%d\n", tvalue,
+         ompt_get_place_proc_ids(0, 0, NULL));
 
-    printf("%" PRIu64 ": ompt_get_place_num()=%d\n", tvalue, ompt_get_place_num());
+  printf("%" PRIu64 ": ompt_get_place_num()=%d\n", tvalue,
+         ompt_get_place_num());
 
-    printf("%" PRIu64 ": ompt_get_partition_place_nums()=%d\n", tvalue, ompt_get_partition_place_nums(0, NULL));
+  printf("%" PRIu64 ": ompt_get_partition_place_nums()=%d\n", tvalue,
+         ompt_get_partition_place_nums(0, NULL));
 
-    printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", tvalue, ompt_get_proc_id());
+  printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", tvalue, ompt_get_proc_id());
 
-    printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", tvalue, ompt_get_num_procs());
+  printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", tvalue,
+         ompt_get_num_procs());
+
+  ompt_callback_t callback;
+  printf("%" PRIu64 ": ompt_get_callback()=%d\n", tvalue,
+         ompt_get_callback(ompt_callback_thread_begin, &callback));
+
+  printf("%" PRIu64 ": ompt_get_state()=%d\n", tvalue, ompt_get_state(NULL));
+
+  int state = omp_state_undefined;
+  const char *state_name;
+  printf("%" PRIu64 ": ompt_enumerate_states()=%d\n", tvalue,
+         ompt_enumerate_states(state, &state, &state_name));
+
+  int impl = ompt_mutex_impl_unknown;
+  const char *impl_name;
+  printf("%" PRIu64 ": ompt_enumerate_mutex_impls()=%d\n", tvalue,
+         ompt_enumerate_mutex_impls(impl, &impl, &impl_name));
+
+  printf("%" PRIu64 ": ompt_get_thread_data()=%p\n", tvalue,
+         ompt_get_thread_data());
+
+  printf("%" PRIu64 ": ompt_get_parallel_info()=%d\n", tvalue,
+         ompt_get_parallel_info(0, NULL, NULL));
+
+  printf("%" PRIu64 ": ompt_get_task_info()=%d\n", tvalue,
+         ompt_get_task_info(0, NULL, NULL, NULL, NULL, NULL));
 }
 
-
-int main()
-{
-  #pragma omp parallel num_threads(1)
+int main() {
+#pragma omp parallel num_threads(1)
   {}
 
   std::thread t1(f);
@@ -46,6 +74,19 @@
 
   // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()={{[0-9]+}}
 
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_callback()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_state()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_states()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_mutex_impls()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_thread_data()=[[NULL]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_parallel_info()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_task_info()=0
 
   return 0;
 }
diff --git a/runtime/test/ompt/misc/interoperability.cpp b/runtime/test/ompt/misc/interoperability.cpp
index 5072f91..102e6de 100644
--- a/runtime/test/ompt/misc/interoperability.cpp
+++ b/runtime/test/ompt/misc/interoperability.cpp
@@ -3,6 +3,7 @@
 
 #include <iostream>
 #include <thread>
+#include <alloca.h>
 
 #include "callback.h"
 #include "omp.h"
@@ -15,6 +16,9 @@
   // runtime isn't initialized yet...)
   omp_get_num_threads();
 
+  // Call alloca() to force availability of frame pointer
+  void *p = alloca(0);
+
   OMPT_SIGNAL(condition);
   // Wait for both initial threads to arrive that will eventually become the
   // master threads in the following parallel region.
diff --git a/runtime/test/ompt/misc/threads.c b/runtime/test/ompt/misc/threads.c
new file mode 100644
index 0000000..4a0fc6f
--- /dev/null
+++ b/runtime/test/ompt/misc/threads.c
@@ -0,0 +1,34 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  int x = 0;
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp atomic
+    x++;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]]
+  // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID3]]
+
+  return 0;
+}
diff --git a/runtime/test/ompt/misc/threads_nested.c b/runtime/test/ompt/misc/threads_nested.c
new file mode 100644
index 0000000..0d38dcf
--- /dev/null
+++ b/runtime/test/ompt/misc/threads_nested.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+
+  int condition = 0;
+  int x = 0;
+  omp_set_nested(1);
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp parallel num_threads(2)
+    {
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition, 4);
+    }
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]]
+  // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID3]]
+
+  return 0;
+}
diff --git a/runtime/test/ompt/parallel/nested_thread_num.c b/runtime/test/ompt/parallel/nested_thread_num.c
new file mode 100644
index 0000000..e952f80
--- /dev/null
+++ b/runtime/test/ompt/parallel/nested_thread_num.c
@@ -0,0 +1,357 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h>
+#include <unistd.h>
+
+int main() {
+  int condition = 0;
+  omp_set_nested(1);
+  print_frame(0);
+
+#pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+
+// get all implicit task events before starting nested:
+#pragma omp barrier
+
+#pragma omp parallel num_threads(2)
+    {
+      print_frame_from_outlined_fn(1);
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+      print_frame(0);
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition, 4);
+#pragma omp barrier
+      print_fuzzy_address(1);
+      print_ids(0);
+    }
+    print_fuzzy_address(2);
+    print_ids(0);
+  }
+  print_fuzzy_address(3);
+
+  return 0;
+}
+// Check if libomp supports the callbacks for this test.
+// CHECK-NOT: {{^}}0: Could not register callback
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+// make sure initial data pointers are null
+// CHECK-NOT: 0: parallel_data initially not null
+// CHECK-NOT: 0: task_data initially not null
+// CHECK-NOT: 0: thread_data initially not null
+
+// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]],
+// CHECK-SAME: parent_task_frame.exit=[[NULL]],
+// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}},
+// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]],
+// CHECK-SAME: requested_team_size=2,
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// CHECK-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+
+// Note that we cannot ensure that the worker threads have already called
+// barrier_end and implicit_task_end before parallel_end!
+
+// CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+
+
+// CHECK: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], 
+// CHECK-SAME: task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+// THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+// THREADS: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+// THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], 
+// THREADS-SAME: parent_task_frame.exit=[[NULL]],
+// THREADS-SAME: parent_task_frame.reenter=[[MAIN_REENTER]],
+// THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+// nested parallel masters
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]],
+// THREADS-SAME: team_size=2, thread_num=0
+
+// THREADS: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], 
+// THREADS-SAME: reenter_frame=[[NULL]], 
+// THREADS-SAME: thread_num=0
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 1:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=[[MAIN_REENTER]]
+
+// THREADS: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: parent_task_frame.exit=[[EXIT]],
+// THREADS-SAME: parent_task_frame.reenter=[[REENTER]],
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], 
+// THREADS-SAME: requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]],
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, 
+// THREADS-SAME: thread_num=0
+
+// THREADS: __builtin_frame_address({{.}})=[[NESTED_EXIT:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME:  parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]], 
+// THREADS-SAME: thread_num=0
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]],
+// THREADS-SAME: reenter_frame=[[REENTER]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=[[MAIN_REENTER]]
+
+// THREADS: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]]
+
+// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+// explicit barrier
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME:  parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NESTED_REENTER]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+
+// implicit barrier
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+
+// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], 
+// THREADS-SAME: reenter_frame=[[NULL]]
+
+// implicit barrier
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=[[NULL]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]],
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]], 
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+// Worker of first nesting level
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size=2, 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: parent_task_frame.exit={{0x[0-f]+}},
+// THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}},
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]],
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2,
+// THREADS-SAME: thread_num=[[INNER_THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: thread_num=[[INNER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// nested parallel worker threads
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+// THREADS-SAME: thread_num=[[THREADNUM]]
+
+// can't reliably tell which parallel region is the parent...
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}},
+// THREADS-SAME: task_id={{[0-9]+}}
+// THREADS-SAME: thread_num={{[01]}}
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+// THREADS-SAME: thread_num=0
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// other nested parallel worker threads
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+// THREADS-SAME: thread_num=[[THREADNUM]]
+
+// can't reliably tell which parallel region is the parent...
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}},
+// THREADS-SAME: task_id={{[0-9]+}}
+// THREADS-SAME: thread_num={{[01]}}
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+// THREADS-SAME: thread_num=0
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
diff --git a/runtime/test/ompt/tasks/task_types.c b/runtime/test/ompt/tasks/task_types.c
index 6f2fdb3..40ceb2d 100644
--- a/runtime/test/ompt/tasks/task_types.c
+++ b/runtime/test/ompt/tasks/task_types.c
@@ -43,6 +43,19 @@
       // Output of thread_id is needed to know on which thread task is executed
       printf("%" PRIu64 ": explicit_untied\n", ompt_get_thread_data()->value);
       print_ids(0);
+      print_frame(1);
+      x++;
+#pragma omp taskyield
+      printf("%" PRIu64 ": explicit_untied(2)\n",
+             ompt_get_thread_data()->value);
+      print_ids(0);
+      print_frame(1);
+      x++;
+#pragma omp taskwait
+      printf("%" PRIu64 ": explicit_untied(3)\n",
+             ompt_get_thread_data()->value);
+      print_ids(0);
+      print_frame(1);
       x++;
     }
 // explicit task with final
@@ -146,8 +159,24 @@
   // may be multiple of those
   // CHECK: [[THREAD_ID_3:[0-9]+]]: explicit_untied
   // CHECK: [[THREAD_ID_3]]: task level 0: parallel_id=[[PARALLEL_ID]]
-  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]], exit_frame={{[^\,]*}}
-  // CHECK-SAME: reenter_frame=[[NULL]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // after taskyield
+  // CHECK: [[THREAD_ID_3_2:[0-9]+]]: explicit_untied(2)
+  // CHECK: [[THREAD_ID_3_2]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // after taskwait
+  // CHECK: [[THREAD_ID_3_3:[0-9]+]]: explicit_untied(3)
+  // CHECK: [[THREAD_ID_3_3]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
   // CHECK-SAME: thread_num={{[01]}}
 
diff --git a/runtime/test/ompt/tasks/taskloop.c b/runtime/test/ompt/tasks/taskloop.c
new file mode 100644
index 0000000..59a47bf
--- /dev/null
+++ b/runtime/test/ompt/tasks/taskloop.c
@@ -0,0 +1,81 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | FileCheck --check-prefix=TASKS %s
+// REQUIRES: ompt
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test
+// UNSUPPORTED: gcc-6
+
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  unsigned int i, x;
+
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp barrier
+
+#pragma omp master
+#pragma omp taskloop
+    for (i = 0; i < 5; i += 3) {
+      x++;
+    }
+  }
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK-SAME: requested_team_size=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1:[0-9]+]]
+  // CHECK-SAME: team_size=2, thread_num=0
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]], count=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: new_task_id=[[TASK_ID1:[0-9]+]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-SAME: task_type=ompt_task_explicit=4
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: new_task_id=[[TASK_ID2:[0-9]+]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-SAME: task_type=ompt_task_explicit=4
+  // CHECK-NOT: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: count=2
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_begin:
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1]], team_size=2, thread_num=0
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+
+  // TASKS: ompt_event_task_create:{{.*}} new_task_id={{[0-9]+}}
+  // TASKS-SAME: task_type=ompt_task_initial
+  // TASKS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskloop_begin:
+  // TASKS: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID1:[0-9]+]]
+  // TASKS-SAME: task_type=ompt_task_explicit
+  // TASKS-DAG: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID2:[0-9]+]]
+  // Schedule events:
+  // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID1]]
+  // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID1]], second_task_id={{[0-9]+}}
+  // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID2]]
+  // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID2]], second_task_id={{[0-9]+}}
+  // TASKS-NOT: ompt_event_task_schedule
+
+  return 0;
+}
diff --git a/runtime/test/tasking/bug_36720.c b/runtime/test/tasking/bug_36720.c
new file mode 100644
index 0000000..684d675
--- /dev/null
+++ b/runtime/test/tasking/bug_36720.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run
+
+/*
+Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=36720
+
+Assertion failure at kmp_runtime.cpp(1715): nthreads > 0.
+OMP: Error #13: Assertion failure at kmp_runtime.cpp(1715).
+
+The assertion fails even with OMP_NUM_THREADS=1. If the second task is removed,
+everything runs to completion. If the "omp parallel for" directives are removed
+from inside the tasks, once again everything runs fine.
+*/
+
+#define N 1024
+
+int main() {
+  #pragma omp task
+  {
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < N; i++)
+      (void)0;
+  }
+
+  #pragma omp task
+  {
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < N; ++i)
+      (void)0;
+  }
+
+  #pragma omp taskwait
+
+  return 0;
+}
diff --git a/runtime/test/tasking/omp_taskloop_grainsize.c b/runtime/test/tasking/omp_taskloop_grainsize.c
index d4bd770..0833073 100644
--- a/runtime/test/tasking/omp_taskloop_grainsize.c
+++ b/runtime/test/tasking/omp_taskloop_grainsize.c
@@ -1,5 +1,11 @@
 // RUN: %libomp-compile-and-run
 // RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test
+// UNSUPPORTED: gcc-6
+
 /*
  * Test for taskloop
  * Method: caculate how many times the iteration space is dispatched
diff --git a/runtime/test/tasking/omp_taskloop_num_tasks.c b/runtime/test/tasking/omp_taskloop_num_tasks.c
index 2a932e3..7c3c704 100644
--- a/runtime/test/tasking/omp_taskloop_num_tasks.c
+++ b/runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -1,5 +1,9 @@
 // RUN: %libomp-compile-and-run
 // RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+
 /*
  * Test for taskloop
  * Method: caculate how many times the iteration space is dispatched
diff --git a/runtime/test/worksharing/for/kmp_doacross_check.c b/runtime/test/worksharing/for/kmp_doacross_check.c
index a088965..59b61e3 100644
--- a/runtime/test/worksharing/for/kmp_doacross_check.c
+++ b/runtime/test/worksharing/for/kmp_doacross_check.c
@@ -1,4 +1,10 @@
 // RUN: %libomp-compile-and-run
+// UNSUPPORTED: gcc
+// This test is incompatible with gcc because of the explicit call to
+// __kmpc_doacross_fini().  gcc relies on an implicit call to this function
+// when the last iteration is executed inside the GOMP_loop_*_next() functions.
+// Hence, in gcc, having the explicit call leads to __kmpc_doacross_fini()
+// being called twice.
 #include <stdio.h>
 
 #define N   1000
diff --git a/runtime/test/worksharing/for/omp_doacross.c b/runtime/test/worksharing/for/omp_doacross.c
new file mode 100644
index 0000000..4187112
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_doacross.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+// XFAIL: gcc-4, gcc-5, clang-3.7, clang-3.8, icc-15, icc-16
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#ifndef N
+#define N 750
+#endif
+
+int test_doacross() {
+  int i, j;
+  // Allocate and zero out the matrix
+  int *m = (int *)malloc(sizeof(int) * N * N);
+  for (i = 0; i < N; ++i) {
+    for (j = 0; j < N; ++j) {
+      m[i * N + j] = 0;
+    }
+  }
+  // Have first row and column be 0, 1, 2, 3, etc.
+  for (i = 0; i < N; ++i)
+    m[i * N] = i;
+  for (j = 0; j < N; ++j)
+    m[j] = j;
+  // Perform wavefront which results in matrix:
+  // 0 1 2 3 4
+  // 1 2 3 4 5
+  // 2 3 4 5 6
+  // 3 4 5 6 7
+  // 4 5 6 7 8
+  #pragma omp parallel shared(m)
+  {
+    int row, col;
+    #pragma omp for ordered(2)
+    for (row = 1; row < N; ++row) {
+      for (col = 1; col < N; ++col) {
+        #pragma omp ordered depend(sink : row - 1, col) depend(sink : row, col - 1)
+        m[row * N + col] = m[(row - 1) * N + col] + m[row * N + (col - 1)] -
+                           m[(row - 1) * N + (col - 1)];
+        #pragma omp ordered depend(source)
+      }
+    }
+  }
+
+  // Check the bottom right element to see if iteration dependencies were held
+  int retval = (m[(N - 1) * N + N - 1] == 2 * (N - 1));
+  free(m);
+  return retval;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  int num_failed = 0;
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_doacross()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/runtime/test/worksharing/for/omp_for_bigbounds.c b/runtime/test/worksharing/for/omp_for_bigbounds.c
index 82f579b..901d760 100644
--- a/runtime/test/worksharing/for/omp_for_bigbounds.c
+++ b/runtime/test/worksharing/for/omp_for_bigbounds.c
@@ -1,7 +1,10 @@
 // RUN: %libomp-compile -DMY_SCHEDULE=static && %libomp-run
 // RUN: %libomp-compile -DMY_SCHEDULE=dynamic && %libomp-run
 // RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run
-// XFAIL: *
+
+// Only works with Intel Compiler since at least version 15.0
+// XFAIL: gcc, clang
+
 /*
  * Test that large bounds are handled properly and calculations of
  * loop iterations don't accidently overflow
diff --git a/www/index.html b/www/index.html
index 56df855..d8a2571 100644
--- a/www/index.html
+++ b/www/index.html
@@ -49,10 +49,7 @@
       </li>
       <li>
         the library that supports offload to target devices (in
-        "offload")
-      </li>
-      <li>
-        the OpenUH test-suite used to validate the OpenMP runtime
+        "libomptarget")
       </li>
     </ul>
   </p>
@@ -60,12 +57,7 @@
   <p>Support for the parts of the OpenMP 4.0 (and later) language that are not
   associated with the "target" constructs are contained in the
   "runtime" directory. Support for offloading computation via the
-  "target" directive is in the separate "offload" directory. That
-  builds a library that provides the interfaces for transferring code
-  and data to attached computational devices such as
-  the Intel&reg Xeon Phi&#0153 coprocessor or GPUs.
-  The README.txt in the "offload"
-  directory describes how to build the offload library.
+  "target" directive is in the separate "libomptarget" directory.
   </p>
 
   <p>All of the code here is <a