Upgrade OpenCL-CTS to v2023-10-10-00 am: 3cc456717b

Original change: https://android-review.googlesource.com/c/platform/external/OpenCL-CTS/+/2793473

Change-Id: Ib79c3c567b5d397ac871e7cc7d0d77d48effe760
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 1ba63ab..635e4a7 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -13,16 +13,16 @@
       fail-fast: false
       matrix:
         mainmatrix: [true]
-        os: [ubuntu-20.04, macos-latest, windows-latest]
+        os: [ubuntu-22.04, macos-latest, windows-latest]
         include:
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: true
             gl: 1
             extra: " gl"
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: false
             arch: arm
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: false
             arch: aarch64
             debug: 1
@@ -55,10 +55,10 @@
         run: ./presubmit.sh
   formatcheck:
     name: Check code format
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - name: Install packages
-        run: sudo apt install -y clang-format clang-format-9
+        run: sudo apt install -y clang-format clang-format-11
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fce58d..6c9bbf6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,10 +103,6 @@
     if(NOT CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo|MinSizeRel")
         # Enable more warnings if not doing a release build.
         add_cxx_flag_if_supported(-Wall)
-        # Suppress warnings that currently trigger on the code base.
-        # This list should shrink over time when warnings are fixed.
-        add_cxx_flag_if_supported(-Wno-sometimes-uninitialized)
-        add_cxx_flag_if_supported(-Wno-sign-compare)
     endif()
     add_cxx_flag_if_supported(-Wno-narrowing)
     add_cxx_flag_if_supported(-Wno-format)
diff --git a/METADATA b/METADATA
index 41886b7..b954006 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@
     type: GIT
     value: "https://github.com/KhronosGroup/OpenCL-CTS.git"
   }
-  version: "v2023-05-16-00"
+  version: "v2023-10-10-00"
   license_type: NOTICE
   last_upgrade_date {
     year: 2023
-    month: 7
-    day: 25
+    month: 10
+    day: 17
   }
 }
diff --git a/check-format.sh b/check-format.sh
index be8f9d7..b5dc0a7 100755
--- a/check-format.sh
+++ b/check-format.sh
@@ -2,7 +2,7 @@
 
 # Arg used to specify non-'origin/main' comparison branch
 ORIGIN_BRANCH=${1:-"origin/main"}
-CLANG_BINARY=${2:-"`which clang-format-9`"}
+CLANG_BINARY=${2:-"`which clang-format-11`"}
 
 # Run git-clang-format to check for violations
 CLANG_FORMAT_OUTPUT=$(git-clang-format --diff $ORIGIN_BRANCH --extensions c,cpp,h,hpp --binary $CLANG_BINARY)
diff --git a/presubmit.sh b/presubmit.sh
index 605c10b..10354ab 100755
--- a/presubmit.sh
+++ b/presubmit.sh
@@ -77,7 +77,6 @@
       -DBUILD_WSI_XLIB_SUPPORT=OFF \
       -DBUILD_WSI_XCB_SUPPORT=OFF \
       -DBUILD_WSI_WAYLAND_SUPPORT=OFF \
-      -DUSE_GAS=OFF \
       -C helper.cmake ..
 cmake --build . -j2
 
diff --git a/test_common/gl/helpers.cpp b/test_common/gl/helpers.cpp
index b9f95a9..62f6325 100644
--- a/test_common/gl/helpers.cpp
+++ b/test_common/gl/helpers.cpp
@@ -966,12 +966,13 @@
 
 #ifdef GL_VERSION_3_2
 
-#define check_gl_error() \
-{ \
-  GLenum errnom = GL_NO_ERROR;\
-  if ((errnom = glGetError()) != GL_NO_ERROR)\
-    log_error("GL Error: 0x%04X at %s:%d\n", errnom, __FILE__, __LINE__);\
-}
+#define CHECK_GL_ERROR()                                                       \
+    {                                                                          \
+        GLenum errnom = GL_NO_ERROR;                                           \
+        if ((errnom = glGetError()) != GL_NO_ERROR)                            \
+            log_error("GL Error: 0x%04X at %s:%d\n", errnom, __FILE__,         \
+                      __LINE__);                                               \
+    }
 
 const char *get_gl_vector_type( GLenum internalformat )
 {
@@ -1045,10 +1046,12 @@
 
   // Check if the renderer supports enough samples
   GLint max_samples = get_gl_max_samples(target, internalFormat);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   if (max_samples < (GLint)samples)
-    log_error("GL error: requested samples (%d) exceeds renderer max samples (%d)\n", samples, max_samples);
+      log_error("GL error: requested samples (%zu) exceeds renderer max "
+                "samples (%d)\n",
+                samples, max_samples);
 
   // Setup the GLSL program
   const GLchar *vertex_source =
@@ -1075,36 +1078,36 @@
   glShaderWrapper vertex_shader = glCreateShader(GL_VERTEX_SHADER);
   glShaderSource(vertex_shader, 1, &vertex_source, NULL);
   glCompileShader(vertex_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glShaderWrapper fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
   glShaderSource(fragment_shader, 1, &fragment_source, NULL);
   glCompileShader(fragment_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLuint prog = glCreateProgram();
   glAttachShader(prog, vertex_shader);
   glAttachShader(prog, fragment_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glBindAttribLocation(prog, 0, "att0");
   glLinkProgram(prog);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   // Setup the FBO and texture
   glFramebufferWrapper fbo;
   glGenFramebuffers(1, &fbo);
   glBindFramebuffer(GL_FRAMEBUFFER, fbo);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glViewport(0, 0, width, height);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLuint tex = 0;
   glGenTextures(1, &tex);
   glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, tex);
   glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, samples, internalFormat, width, height, fixedSampleLocations);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLint attachment;
   switch (internalFormat) {
@@ -1122,7 +1125,7 @@
   }
 
   glFramebufferTexture(GL_FRAMEBUFFER, attachment, tex, 0);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLint status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
   if (status == GL_FRAMEBUFFER_UNSUPPORTED) {
@@ -1142,22 +1145,24 @@
   // Check if the framebuffer supports enough samples
   GLint fbo_samples = 0;
   glGetIntegerv(GL_SAMPLES, &fbo_samples);
-  check_gl_error();
+  CHECK_GL_ERROR();
 
   if (fbo_samples < (GLint)samples)
-    log_error("GL Error: requested samples (%d) exceeds FBO capability (%d)\n", samples, fbo_samples);
+      log_error(
+          "GL Error: requested samples (%zu) exceeds FBO capability (%d)\n",
+          samples, fbo_samples);
 
   glUseProgram(prog);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   if (attachment != GL_DEPTH_ATTACHMENT && attachment != GL_DEPTH_STENCIL_ATTACHMENT) {
     glDisable(GL_DEPTH_TEST);
-    check_gl_error()
+    CHECK_GL_ERROR()
   }
   else {
     glEnable(GL_DEPTH_TEST);
     glDepthFunc(GL_ALWAYS);
-    check_gl_error()
+    CHECK_GL_ERROR()
   }
 
   // Setup the VBO for rendering a quad
@@ -1172,14 +1177,14 @@
   glGenBuffers(1, &vbo);
   glBindBuffer(GL_ARRAY_BUFFER, vbo);
   glBufferData(GL_ARRAY_BUFFER, sizeof(quad), quad, GL_STREAM_DRAW);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glVertexArraysWrapper vao;
   glGenVertexArrays(1, &vao);
   glBindVertexArray(vao);
   glEnableVertexAttribArray(0);
   glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(GLfloat)*2, 0);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   //clearing color and depth buffer
   glClearColor(0, 0, 0, 0);
@@ -1223,13 +1228,13 @@
     color += color_delta;
 
     glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-    check_gl_error();
+    CHECK_GL_ERROR();
 
     glFlush();
   }
 
   glDisable(GL_SAMPLE_MASK);
-  check_gl_error();
+  CHECK_GL_ERROR();
 
   *outTextureID = tex;
 
@@ -1306,7 +1311,9 @@
   GLint max_samples = get_gl_max_samples(target, internalFormat);
 
   if (max_samples < (GLint)samples)
-    log_error("GL error: requested samples (%d) exceeds renderer max samples (%d)\n", samples, max_samples);
+      log_error("GL error: requested samples (%zu) exceeds renderer max "
+                "samples (%d)\n",
+                samples, max_samples);
 
   // Setup the GLSL program
   const GLchar *vertex_source =
@@ -1333,36 +1340,36 @@
   glShaderWrapper vertex_shader = glCreateShader(GL_VERTEX_SHADER);
   glShaderSource(vertex_shader, 1, &vertex_source, NULL);
   glCompileShader(vertex_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glShaderWrapper fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
   glShaderSource(fragment_shader, 1, &fragment_source, NULL);
   glCompileShader(fragment_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glProgramWrapper prog = glCreateProgram();
   glAttachShader(prog, vertex_shader);
   glAttachShader(prog, fragment_shader);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glBindAttribLocation(prog, 0, "att0");
   glLinkProgram(prog);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   // Setup the FBO and texture
   glFramebufferWrapper fbo;
   glGenFramebuffers(1, &fbo);
   glBindFramebuffer(GL_FRAMEBUFFER, fbo);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glViewport(0, 0, width, height);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLuint tex = 0;
   glGenTextures(1, &tex);
   glBindTexture(GL_TEXTURE_2D_MULTISAMPLE_ARRAY, tex);
   glTexImage3DMultisample(GL_TEXTURE_2D_MULTISAMPLE_ARRAY, samples, internalFormat, width, height, total_layers, fixedSampleLocations);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   GLint attachment;
   switch (internalFormat) {
@@ -1384,12 +1391,12 @@
 
   if (attachment != GL_DEPTH_ATTACHMENT && attachment != GL_DEPTH_STENCIL_ATTACHMENT) {
     glDisable(GL_DEPTH_TEST);
-  check_gl_error()
+    CHECK_GL_ERROR()
   }
   else {
     glEnable(GL_DEPTH_TEST);
     glDepthFunc(GL_ALWAYS);
-    check_gl_error()
+    CHECK_GL_ERROR()
   }
 
   // Setup the VBO for rendering a quad
@@ -1404,18 +1411,18 @@
   glGenBuffers(1, &vbo);
   glBindBuffer(GL_ARRAY_BUFFER, vbo);
   glBufferData(GL_ARRAY_BUFFER, sizeof(quad), quad, GL_STREAM_DRAW);
-  check_gl_error()
+  CHECK_GL_ERROR()
 
   glVertexArraysWrapper vao;
   glGenVertexArrays(1, &vao);
   glBindVertexArray(vao);
   glEnableVertexAttribArray(0);
   glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(GLfloat)*2, 0);
-    check_gl_error()
+  CHECK_GL_ERROR()
 
   for (size_t l=0; l!=total_layers; ++l) {
     glFramebufferTextureLayer(GL_FRAMEBUFFER, attachment, tex, 0, l);
-      check_gl_error()
+    CHECK_GL_ERROR()
 
     GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
     if (status == GL_FRAMEBUFFER_UNSUPPORTED) {
@@ -1435,13 +1442,15 @@
     // Check if the framebuffer supports enough samples
     GLint fbo_samples = 0;
     glGetIntegerv(GL_SAMPLES, &fbo_samples);
-    check_gl_error();
+    CHECK_GL_ERROR();
 
     if (fbo_samples < (GLint)samples)
-      log_error("GL Error: requested samples (%d) exceeds FBO capability (%d)\n", samples, fbo_samples);
+        log_error(
+            "GL Error: requested samples (%zu) exceeds FBO capability (%d)\n",
+            samples, fbo_samples);
 
     glUseProgram(prog);
-    check_gl_error()
+    CHECK_GL_ERROR()
 
     //clearing color and depth buffer
     glClearColor(0, 0, 0, 0);
@@ -1482,13 +1491,13 @@
       glUniform1f(glGetUniformLocation(prog, "depthVal"), val);
 
       glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-      check_gl_error();
+      CHECK_GL_ERROR();
 
       glFlush();
     }
 
     glDisable(GL_SAMPLE_MASK);
-    check_gl_error();
+    CHECK_GL_ERROR();
   }
 
   *outTextureID = tex;
@@ -1715,7 +1724,7 @@
         // Reverse and reorder to validate since in the
         // kernel the read_imagef() call always returns RGBA
         cl_uchar *p = (cl_uchar *)buffer;
-        for( size_t i = 0; i < (size_t)width * height; i++ )
+        for (GLsizei i = 0; i < width * height; i++)
         {
             cl_uchar uc0 = p[i * 4 + 0];
             cl_uchar uc1 = p[i * 4 + 1];
@@ -1733,7 +1742,7 @@
       // Reverse and reorder to validate since in the
       // kernel the read_imagef() call always returns RGBA
       cl_uchar *p = (cl_uchar *)buffer;
-      for( size_t i = 0; i < width * height; i++ )
+      for (GLsizei i = 0; i < width * height; i++)
       {
         cl_uchar uc0 = p[i * 4 + 0];
         cl_uchar uc1 = p[i * 4 + 1];
diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h
index 4053b7e..a42f291 100644
--- a/test_common/harness/compat.h
+++ b/test_common/harness/compat.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
+#ifndef COMPAT_H_
+#define COMPAT_H_
 
 #if defined(_WIN32) && defined(_MSC_VER)
 #include <Windows.h>
@@ -398,4 +398,4 @@
 #define sleep(sec) Sleep((sec)*1000)
 #endif
 
-#endif // _COMPAT_H_
+#endif // COMPAT_H_
diff --git a/test_common/harness/crc32.h b/test_common/harness/crc32.h
index 65ca15e..6958701 100644
--- a/test_common/harness/crc32.h
+++ b/test_common/harness/crc32.h
@@ -15,8 +15,8 @@
 executed between Khronos and the recipient.
 ******************************************************************/
 
-#ifndef _CRC32_H_
-#define _CRC32_H_
+#ifndef CRC32_H_
+#define CRC32_H_
 
 #include <stdint.h>
 #include <stddef.h>
diff --git a/test_common/harness/mt19937.cpp b/test_common/harness/mt19937.cpp
index f5665de..2d503eb 100644
--- a/test_common/harness/mt19937.cpp
+++ b/test_common/harness/mt19937.cpp
@@ -51,6 +51,7 @@
 #include "harness/alloc.h"
 
 #ifdef __SSE2__
+#include <mutex>
 #include <emmintrin.h>
 #endif
 
@@ -107,7 +108,7 @@
     /* mag01[x] = x * MATRIX_A  for x=0,1 */
     static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
 #ifdef __SSE2__
-    static volatile int init = 0;
+    static std::once_flag init_flag;
     static union {
         __m128i v;
         cl_uint s[4];
@@ -123,8 +124,7 @@
         int kk;
 
 #ifdef __SSE2__
-        if (0 == init)
-        {
+        auto init_fn = []() {
             upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
                 upper_mask.s[3] = UPPER_MASK;
             lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
@@ -134,8 +134,8 @@
                 MATRIX_A;
             c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
             c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
-            init = 1;
-        }
+        };
+        std::call_once(init_flag, init_fn);
 #endif
 
         kk = 0;
diff --git a/test_common/harness/stringHelpers.h b/test_common/harness/stringHelpers.h
new file mode 100644
index 0000000..e1275f1
--- /dev/null
+++ b/test_common/harness/stringHelpers.h
@@ -0,0 +1,51 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef STRING_HELPERS_H
+#define STRING_HELPERS_H
+
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+// Concatenates sstr[0] .. sstr[num - 1], in order, into one std::string.
+// Each of those entries is passed to the std::string(const char *) constructor.
+inline std::string concat_kernel(const char *sstr[], int num)
+{
+    std::string res;
+    for (int i = 0; i < num; i++) res += std::string(sstr[i]);
+    return res;
+}
+
+// printf-style formatting into a std::string. `str` is the format string and
+// `args` are passed unchanged to std::snprintf, so they have to match the
+// conversion specifiers in `str`. Throws std::runtime_error when the sizing
+// std::snprintf call returns a negative value.
+template <typename... Args>
+inline std::string str_sprintf(const std::string &str, Args... args)
+{
+    // First call only measures the output; +1 makes room for the NUL.
+    int str_size = std::snprintf(nullptr, 0, str.c_str(), args...) + 1;
+    if (str_size <= 0) throw std::runtime_error("Formatting error.");
+    size_t s = static_cast<size_t>(str_size);
+    std::unique_ptr<char[]> buffer(new char[s]);
+    std::snprintf(buffer.get(), s, str.c_str(), args...);
+    // The terminating NUL is not part of the returned string.
+    return std::string(buffer.get(), buffer.get() + s - 1);
+}
+
+#endif // STRING_HELPERS_H
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 95ea816..3d743e7 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -835,9 +835,9 @@
         std::vector<std::thread *> threads;
         test_harness_state state = { testList, resultTestList, deviceToUse,
                                      config };
-        for (int i = 0; i < config.numWorkerThreads; i++)
+        for (unsigned i = 0; i < config.numWorkerThreads; i++)
         {
-            log_info("Spawning worker thread %i\n", i);
+            log_info("Spawning worker thread %u\n", i);
             threads.push_back(new std::thread(test_function_runner, &state));
         }
 
diff --git a/test_common/harness/typeWrappers.h b/test_common/harness/typeWrappers.h
index 50c7c93..ad11b48 100644
--- a/test_common/harness/typeWrappers.h
+++ b/test_common/harness/typeWrappers.h
@@ -145,6 +145,60 @@
 using clEventWrapper =
     wrapper_details::Wrapper<cl_event, clRetainEvent, clReleaseEvent>;
 
+// Move-only owner of a pointer obtained from clSVMAlloc. The pointer is
+// passed to clSVMFree, together with the context it was allocated from,
+// when the wrapper is destroyed or is move-assigned a different allocation.
+class clSVMWrapper {
+    void *Ptr = nullptr;
+    cl_context Ctx = nullptr;
+
+public:
+    clSVMWrapper() = default;
+
+    // Calls clSVMAlloc(C, F, Size, 0) and stores the result as-is; use
+    // operator() to inspect the returned pointer.
+    clSVMWrapper(cl_context C, size_t Size,
+                 cl_svm_mem_flags F = CL_MEM_READ_WRITE)
+        : Ctx(C)
+    {
+        Ptr = clSVMAlloc(C, F, Size, 0);
+    }
+
+    clSVMWrapper &operator=(void *other) = delete;
+    clSVMWrapper(clSVMWrapper const &other) = delete;
+    clSVMWrapper &operator=(clSVMWrapper const &other) = delete;
+    clSVMWrapper(clSVMWrapper &&other)
+    {
+        Ptr = other.Ptr;
+        Ctx = other.Ctx;
+        other.Ptr = nullptr;
+        other.Ctx = nullptr;
+    }
+    clSVMWrapper &operator=(clSVMWrapper &&other)
+    {
+        // Self-move must not drop the allocation, and whatever this wrapper
+        // already holds has to be freed before it is overwritten.
+        if (this != &other)
+        {
+            if (Ptr) clSVMFree(Ctx, Ptr);
+            Ptr = other.Ptr;
+            Ctx = other.Ctx;
+            other.Ptr = nullptr;
+            other.Ctx = nullptr;
+        }
+        return *this;
+    }
+
+    ~clSVMWrapper()
+    {
+        if (Ptr) clSVMFree(Ctx, Ptr);
+    }
+
+    // Returns the held pointer (null when nothing is held).
+    void *operator()() const { return Ptr; }
+};
+
+
 class clProtectedImage {
 public:
     clProtectedImage()
diff --git a/test_conformance/SVM/CMakeLists.txt b/test_conformance/SVM/CMakeLists.txt
index 2d01a82..2ad2f82 100644
--- a/test_conformance/SVM/CMakeLists.txt
+++ b/test_conformance/SVM/CMakeLists.txt
@@ -17,4 +17,6 @@
     test_migrate.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sometimes-uninitialized -Wno-sign-compare")
+
 include(../CMakeCommon.txt)
diff --git a/test_conformance/allocations/CMakeLists.txt b/test_conformance/allocations/CMakeLists.txt
index a404380..b603122 100644
--- a/test_conformance/allocations/CMakeLists.txt
+++ b/test_conformance/allocations/CMakeLists.txt
@@ -8,4 +8,6 @@
         allocation_utils.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sign-compare")
+
 include(../CMakeCommon.txt)
diff --git a/test_conformance/allocations/allocation_execute.cpp b/test_conformance/allocations/allocation_execute.cpp
index 9d0e877..5a77c3a 100644
--- a/test_conformance/allocations/allocation_execute.cpp
+++ b/test_conformance/allocations/allocation_execute.cpp
@@ -79,20 +79,30 @@
         return -1;
     }
 
-    if (type == CL_MEM_OBJECT_BUFFER) {
-        log_error("Expected image object, not buffer.\n");
-        return -1;
-    } else if (type == CL_MEM_OBJECT_IMAGE2D) {
-        error = clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
-        if (error) {
-            print_error(error, "clGetMemObjectInfo failed for CL_IMAGE_WIDTH.");
+    switch (type)
+    {
+        case CL_MEM_OBJECT_BUFFER:
+            log_error("Expected image object, not buffer.\n");
             return -1;
-        }
-        error = clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
-        if (error) {
-            print_error(error, "clGetMemObjectInfo failed for CL_IMAGE_HEIGHT.");
-            return -1;
-        }
+        case CL_MEM_OBJECT_IMAGE2D:
+            error = clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(width), &width,
+                                   NULL);
+            if (error)
+            {
+                print_error(error,
+                            "clGetMemObjectInfo failed for CL_IMAGE_WIDTH.");
+                return -1;
+            }
+            error = clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(height),
+                                   &height, NULL);
+            if (error)
+            {
+                print_error(error,
+                            "clGetMemObjectInfo failed for CL_IMAGE_HEIGHT.");
+                return -1;
+            }
+            break;
+        default: log_error("unexpected object type"); return -1;
     }
 
 
diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp
index 43e8127..827072f 100644
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -326,6 +326,7 @@
         else if ( strcmp( argv[i], "--help" ) == 0 || strcmp( argv[i], "-h" ) == 0 )
         {
             printUsage( argv[0] );
+            free(argList);
             return -1;
         }
 
diff --git a/test_conformance/api/test_mem_object_info.cpp b/test_conformance/api/test_mem_object_info.cpp
index 8dc8f6c..7eedec8 100644
--- a/test_conformance/api/test_mem_object_info.cpp
+++ b/test_conformance/api/test_mem_object_info.cpp
@@ -217,6 +217,9 @@
             // Create a buffer object to test against.
             bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, NULL, &error );
             test_error( error, "Unable to create buffer to test with" );
+            void *ptr;
+            TEST_MEM_OBJECT_PARAM(bufferObject, CL_MEM_HOST_PTR, ptr, NULL,
+                                  "host pointer", "%p", void *)
         }
 
         // Perform buffer object queries.
diff --git a/test_conformance/api/test_null_buffer_arg.cpp b/test_conformance/api/test_null_buffer_arg.cpp
index 75bdd47..83fcb63 100644
--- a/test_conformance/api/test_null_buffer_arg.cpp
+++ b/test_conformance/api/test_null_buffer_arg.cpp
@@ -64,16 +64,21 @@
     cl_int status;
     const char *typestr;
 
-    if (type == NON_NULL_PATH) {
-        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
-        typestr = "non-NULL";
-    } else if (type == ADDROF_NULL_PATH) {
-        test_buf = NULL;
-        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
-        typestr = "&NULL";
-    } else if (type == NULL_PATH) {
-        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), NULL);
-        typestr = "NULL";
+    switch (type)
+    {
+        case NON_NULL_PATH:
+            status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
+            typestr = "non-NULL";
+            break;
+        case ADDROF_NULL_PATH:
+            test_buf = NULL;
+            status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
+            typestr = "&NULL";
+            break;
+        case NULL_PATH:
+            status = clSetKernelArg(kernel, 0, sizeof(cl_mem), NULL);
+            typestr = "NULL";
+            break;
     }
 
     log_info("Testing setKernelArgs with %s buffer.\n", typestr);
diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index fa5c227..f074010 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -644,6 +644,13 @@
     }
     log_info( "\tReported device profile: %s \n", profile );
 
+    if (strcmp(profile, "FULL_PROFILE") == 0 && compilerAvail != CL_TRUE)
+    {
+        log_error("ERROR: Returned profile of device is FULL , but "
+                  "CL_DEVICE_COMPILER_AVAILABLE is not CL_TRUE as required by "
+                  "OpenCL 1.2!");
+        return -1;
+    }
 
     return 0;
 }
@@ -799,8 +806,8 @@
         test_error(error, "clFinish failed");
 
         if (max_dimensions == 2) {
-            return 0;
             free(source);
+            return 0;
         }
 
         local[1]--; local[2]++;
diff --git a/test_conformance/api/test_queue_properties.cpp b/test_conformance/api/test_queue_properties.cpp
index 62d0a73..768bd5d 100644
--- a/test_conformance/api/test_queue_properties.cpp
+++ b/test_conformance/api/test_queue_properties.cpp
@@ -107,8 +107,9 @@
 
     clProgramWrapper program;
     clKernelWrapper kernel;
-    cl_queue_properties_khr device_props = 0;
-    cl_queue_properties_khr queue_prop_def[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+    cl_command_queue_properties device_props = 0;
+    cl_command_queue_properties queue_prop_def[] = { CL_QUEUE_PROPERTIES, 0,
+                                                     0 };
 
     // Query extension
     if (!is_extension_available(deviceID, "cl_khr_create_command_queue"))
diff --git a/test_conformance/atomics/test_indexed_cases.cpp b/test_conformance/atomics/test_indexed_cases.cpp
index 7da2dfa..ce0410b 100644
--- a/test_conformance/atomics/test_indexed_cases.cpp
+++ b/test_conformance/atomics/test_indexed_cases.cpp
@@ -13,6 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
+#include <memory>
+
 #include "testBase.h"
 #include "harness/conversions.h"
 
@@ -226,13 +229,13 @@
              (int)global_threads[0], (int)local_threads[0]);
 
     // Allocate our storage
-    cl_mem bin_counters =
+    clMemWrapper bin_counters =
         clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(cl_int) * number_of_bins, NULL, NULL);
-    cl_mem bins = clCreateBuffer(
+    clMemWrapper bins = clCreateBuffer(
         context, CL_MEM_READ_WRITE,
         sizeof(cl_int) * number_of_bins * max_counts_per_bin, NULL, NULL);
-    cl_mem bin_assignments =
+    clMemWrapper bin_assignments =
         clCreateBuffer(context, CL_MEM_READ_ONLY,
                        sizeof(cl_int) * number_of_items, NULL, NULL);
 
@@ -253,7 +256,7 @@
     }
 
     // Initialize our storage
-    cl_int *l_bin_counts = (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    std::unique_ptr<cl_int[]> l_bin_counts(new cl_int[number_of_bins]);
     if (!l_bin_counts)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -263,8 +266,8 @@
     int i;
     for (i = 0; i < number_of_bins; i++) l_bin_counts[i] = 0;
     err = clEnqueueWriteBuffer(queue, bin_counters, true, 0,
-                               sizeof(cl_int) * number_of_bins, l_bin_counts, 0,
-                               NULL, NULL);
+                               sizeof(cl_int) * number_of_bins,
+                               l_bin_counts.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to set initial values for "
@@ -273,8 +276,8 @@
         return -1;
     }
 
-    cl_int *values =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    std::unique_ptr<cl_int[]> values(
+        new cl_int[number_of_bins * max_counts_per_bin]);
     if (!values)
     {
         log_error(
@@ -285,7 +288,7 @@
     err = clEnqueueWriteBuffer(queue, bins, true, 0,
                                sizeof(cl_int) * number_of_bins
                                    * max_counts_per_bin,
-                               values, 0, NULL, NULL);
+                               values.get(), 0, NULL, NULL);
     if (err)
     {
         log_error(
@@ -293,10 +296,8 @@
             err);
         return -1;
     }
-    free(values);
 
-    cl_int *l_bin_assignments =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_items);
+    std::unique_ptr<cl_int[]> l_bin_assignments(new cl_int[number_of_items]);
     if (!l_bin_assignments)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -326,7 +327,7 @@
     }
     err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0,
                                sizeof(cl_int) * number_of_items,
-                               l_bin_assignments, 0, NULL, NULL);
+                               l_bin_assignments.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to set initial values for "
@@ -355,8 +356,8 @@
         return -1;
     }
 
-    cl_int *final_bin_assignments =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    std::unique_ptr<cl_int[]> final_bin_assignments(
+        new cl_int[number_of_bins * max_counts_per_bin]);
     if (!final_bin_assignments)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -366,15 +367,14 @@
     err = clEnqueueReadBuffer(queue, bins, true, 0,
                               sizeof(cl_int) * number_of_bins
                                   * max_counts_per_bin,
-                              final_bin_assignments, 0, NULL, NULL);
+                              final_bin_assignments.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
         return -1;
     }
 
-    cl_int *final_bin_counts =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    std::unique_ptr<cl_int[]> final_bin_counts(new cl_int[number_of_bins]);
     if (!final_bin_counts)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -382,8 +382,8 @@
         return -1;
     }
     err = clEnqueueReadBuffer(queue, bin_counters, true, 0,
-                              sizeof(cl_int) * number_of_bins, final_bin_counts,
-                              0, NULL, NULL);
+                              sizeof(cl_int) * number_of_bins,
+                              final_bin_counts.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to read back bin_counters: %d\n",
@@ -460,13 +460,7 @@
             errors++;
         }
     }
-    free(l_bin_counts);
-    free(l_bin_assignments);
-    free(final_bin_assignments);
-    free(final_bin_counts);
-    clReleaseMemObject(bin_counters);
-    clReleaseMemObject(bins);
-    clReleaseMemObject(bin_assignments);
+
     if (errors == 0)
     {
         log_info("add_index_bin_test passed. Each item was put in the correct "
diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index dde3311..684a7d1 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -2,7 +2,7 @@
 
 set(${MODULE_NAME}_SOURCES
     main.cpp
-    test_fpmath_float.cpp
+    test_fpmath.cpp
     test_intmath.cpp
     test_hiloeo.cpp test_local.cpp test_pointercast.cpp
     test_if.cpp test_loop.cpp
@@ -11,7 +11,7 @@
     test_multireadimageonefmt.cpp test_multireadimagemultifmt.cpp
     test_imagedim.cpp
     test_vloadstore.cpp
-    test_int2float.cpp test_float2int.cpp
+    test_int2fp.cpp
     test_createkernelsinprogram.cpp
     test_hostptr.cpp
     test_explicit_s2v.cpp
@@ -52,14 +52,12 @@
     test_kernel_call_kernel_function.cpp
     test_local_kernel_scope.cpp
     test_progvar.cpp
-    test_wg_barrier.cpp
     test_global_linear_id.cpp
     test_local_linear_id.cpp
     test_enqueued_local_size.cpp
     test_simple_image_pitch.cpp
     test_get_linear_ids.cpp
     test_rw_image_access_qualifier.cpp
-    test_wg_barrier.cpp
     test_enqueued_local_size.cpp
     test_global_linear_id.cpp
     test_local_linear_id.cpp
@@ -70,6 +68,6 @@
     list(APPEND ${MODULE_NAME}_SOURCES test_queue_priority.cpp)
 endif(APPLE)
 
-set_gnulike_module_compile_flags("-Wno-unused-but-set-variable")
+set_gnulike_module_compile_flags("-Wno-sign-compare")
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp
index 86c3cec..d1901f9 100644
--- a/test_conformance/basic/main.cpp
+++ b/test_conformance/basic/main.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,14 +22,15 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <CL/cl_half.h>
+
 #include "harness/testHarness.h"
 #include "procs.h"
 
 test_definition test_list[] = {
     ADD_TEST(hostptr),
-    ADD_TEST(fpmath_float),
-    ADD_TEST(fpmath_float2),
-    ADD_TEST(fpmath_float4),
+    ADD_TEST(fpmath),
     ADD_TEST(intmath_int),
     ADD_TEST(intmath_int2),
     ADD_TEST(intmath_int4),
@@ -58,8 +59,8 @@
     ADD_TEST(image_r8),
     ADD_TEST(barrier),
     ADD_TEST_VERSION(wg_barrier, Version(2, 0)),
-    ADD_TEST(int2float),
-    ADD_TEST(float2int),
+    ADD_TEST(int2fp),
+    ADD_TEST(fp2int),
     ADD_TEST(imagereadwrite),
     ADD_TEST(imagereadwrite3d),
     ADD_TEST(readimage3d),
@@ -155,7 +156,7 @@
     ADD_TEST(simple_read_image_pitch),
     ADD_TEST(simple_write_image_pitch),
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     ADD_TEST(queue_priority),
 #endif
 
@@ -164,9 +165,42 @@
 };
 
 const int test_num = ARRAY_SIZE( test_list );
+// Half rounding mode chosen by InitCL(); stays CL_HALF_RTE unless the device
+// check below selects CL_HALF_RTZ.
+cl_half_rounding_mode halfRoundingMode = CL_HALF_RTE;
+
+// Device check passed to runTestHarnessWithCheck() by main(). On devices that
+// report cl_khr_fp16 it maps the result of get_default_rounding_mode() for
+// CL_DEVICE_HALF_FP_CONFIG onto halfRoundingMode, testing round-to-nearest
+// before round-to-zero, and returns TEST_FAIL when neither bit is set.
+// Devices without cl_khr_fp16 leave halfRoundingMode untouched.
+test_status InitCL(cl_device_id device)
+{
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            halfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            halfRoundingMode = CL_HALF_RTZ;
+        }
+        else
+        {
+            log_error("Error while acquiring half rounding mode\n");
+            return TEST_FAIL;
+        }
+    }
+
+    return TEST_PASS;
+}
 
 int main(int argc, const char *argv[])
 {
-    return runTestHarness(argc, argv, test_num, test_list, false, 0);
+    return runTestHarnessWithCheck(argc, argv, test_num, test_list, false, 0,
+                                   InitCL);
 }
 
diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h
index c14340d..b685ecd 100644
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "harness/kernelHelpers.h"
 #include "harness/testHarness.h"
 #include "harness/errorHelpers.h"
@@ -21,9 +22,8 @@
 #include "harness/rounding_mode.h"
 
 extern int      test_hostptr(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_fpmath(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
 extern int      test_intmath_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_intmath_int2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_intmath_int4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
@@ -52,8 +52,10 @@
 extern int      test_simplebarrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_wg_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_int2float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_float2int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_int2fp(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
+extern int test_fp2int(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
 extern int      test_imagearraycopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_imagearraycopy3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_imagereadwrite(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/basic/test_astype.cpp b/test_conformance/basic/test_astype.cpp
index 7281f90..45669a7 100644
--- a/test_conformance/basic/test_astype.cpp
+++ b/test_conformance/basic/test_astype.cpp
@@ -14,62 +14,39 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-
-#include "procs.h"
 #include "harness/conversions.h"
+#include "harness/stringHelpers.h"
 #include "harness/typeWrappers.h"
 
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <vector>
 
-static const char *astype_kernel_pattern =
-"%s\n"
+#include "procs.h"
+
+// clang-format off
+
+static char extension[128] = { 0 };
+static char strLoad[128] = { 0 };
+static char strStore[128] = { 0 };
+static const char *regLoad = "as_%s%s(src[tid]);\n";
+static const char *v3Load = "as_%s%s(vload3(tid,(__global %s*)src));\n";
+static const char *regStore = "dst[tid] = tmp;\n";
+static const char *v3Store = "vstore3(tmp, tid, (__global %s*)dst);\n";
+
+static const char* astype_kernel_pattern[] = {
+extension,
 "__kernel void test_fn( __global %s%s *src, __global %s%s *dst )\n"
 "{\n"
-"    int tid = get_global_id( 0 );\n"
-"    %s%s tmp = as_%s%s( src[ tid ] );\n"
-"   dst[ tid ] = tmp;\n"
-"}\n";
+"    int tid = get_global_id( 0 );\n",
+"    %s%s tmp = ", strLoad,
+"    ", strStore,
+"}\n"};
 
-static const char *astype_kernel_pattern_V3srcV3dst =
-"%s\n"
-"__kernel void test_fn( __global %s *src, __global %s *dst )\n"
-"{\n"
-"    int tid = get_global_id( 0 );\n"
-"    %s%s tmp = as_%s%s( vload3(tid,src) );\n"
-"   vstore3(tmp,tid,dst);\n"
-"}\n";
-// in the printf, remove the third and fifth argument, each of which
-// should be a "3", when copying from the printf for astype_kernel_pattern
-
-static const char *astype_kernel_pattern_V3dst =
-"%s\n"
-"__kernel void test_fn( __global %s%s *src, __global %s *dst )\n"
-"{\n"
-"    int tid = get_global_id( 0 );\n"
-"    %s3 tmp = as_%s3( src[ tid ] );\n"
-"   vstore3(tmp,tid,dst);\n"
-"}\n";
-// in the printf, remove the fifth argument, which
-// should be a "3", when copying from the printf for astype_kernel_pattern
-
-
-static const char *astype_kernel_pattern_V3src =
-"%s\n"
-"__kernel void test_fn( __global %s *src, __global %s%s *dst )\n"
-"{\n"
-"    int tid = get_global_id( 0 );\n"
-"    %s%s tmp = as_%s%s( vload3(tid,src) );\n"
-"   dst[ tid ] = tmp;\n"
-"}\n";
-// in the printf, remove the third argument, which
-// should be a "3", when copying from the printf for astype_kernel_pattern
-
+// clang-format on
 
 int test_astype_set( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType inVecType, ExplicitType outVecType,
                     unsigned int vecSize, unsigned int outVecSize,
@@ -81,68 +58,60 @@
     clKernelWrapper kernel;
     clMemWrapper streams[ 2 ];
 
-    char programSrc[ 10240 ];
     size_t threads[ 1 ], localThreads[ 1 ];
     size_t typeSize = get_explicit_type_size( inVecType );
     size_t outTypeSize = get_explicit_type_size(outVecType);
     char sizeNames[][ 3 ] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
-    MTdata d;
+    MTdataHolder d(gRandomSeed);
 
+    std::ostringstream sstr;
+    if (outVecType == kDouble || inVecType == kDouble)
+        sstr << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
+    if (outVecType == kHalf || inVecType == kHalf)
+        sstr << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
 
-    // Create program
-    if(outVecSize == 3 && vecSize == 3) {
-        // astype_kernel_pattern_V3srcV3dst
-        sprintf( programSrc, astype_kernel_pattern_V3srcV3dst,
-                (outVecType == kDouble || inVecType == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-                get_explicit_type_name( inVecType ), // sizeNames[ vecSize ],
-                get_explicit_type_name( outVecType ), // sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ] );
-    } else if(outVecSize == 3) {
-        // astype_kernel_pattern_V3dst
-        sprintf( programSrc, astype_kernel_pattern_V3dst,
-                (outVecType == kDouble || inVecType == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-                get_explicit_type_name( inVecType ), sizeNames[ vecSize ],
-                get_explicit_type_name( outVecType ),
-                get_explicit_type_name( outVecType ),
-                get_explicit_type_name( outVecType ));
+    std::snprintf(extension, sizeof(extension), "%s", sstr.str().c_str());
 
-    } else if(vecSize == 3) {
-        // astype_kernel_pattern_V3src
-        sprintf( programSrc, astype_kernel_pattern_V3src,
-                (outVecType == kDouble || inVecType == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-                get_explicit_type_name( inVecType ),// sizeNames[ vecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ]);
-    } else {
-        sprintf( programSrc, astype_kernel_pattern,
-                (outVecType == kDouble || inVecType == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-                get_explicit_type_name( inVecType ), sizeNames[ vecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ],
-                get_explicit_type_name( outVecType ), sizeNames[ outVecSize ]);
-    }
+    if (vecSize == 3)
+        std::snprintf(strLoad, sizeof(strLoad), v3Load,
+                      get_explicit_type_name(outVecType), sizeNames[outVecSize],
+                      get_explicit_type_name(inVecType));
+    else
+        std::snprintf(strLoad, sizeof(strLoad), regLoad,
+                      get_explicit_type_name(outVecType),
+                      sizeNames[outVecSize]);
 
-    const char *ptr = programSrc;
+    if (outVecSize == 3)
+        std::snprintf(strStore, sizeof(strStore), v3Store,
+                      get_explicit_type_name(outVecType));
+    else
+        std::snprintf(strStore, sizeof(strStore), "%s", regStore);
+
+    auto str =
+        concat_kernel(astype_kernel_pattern,
+                      sizeof(astype_kernel_pattern) / sizeof(const char *));
+    std::string kernelSource =
+        str_sprintf(str, get_explicit_type_name(inVecType), sizeNames[vecSize],
+                    get_explicit_type_name(outVecType), sizeNames[outVecSize],
+                    get_explicit_type_name(outVecType), sizeNames[outVecSize]);
+
+    const char *ptr = kernelSource.c_str();
     error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
     test_error( error, "Unable to create testing kernel" );
 
-
     // Create some input values
     size_t inBufferSize = sizeof(char)* numElements * get_explicit_type_size( inVecType ) * vecSize;
-    char *inBuffer = (char*)malloc( inBufferSize );
+    std::vector<char> inBuffer(inBufferSize);
     size_t outBufferSize = sizeof(char)* numElements * get_explicit_type_size( outVecType ) *outVecSize;
-    char *outBuffer = (char*)malloc( outBufferSize );
+    std::vector<char> outBuffer(outBufferSize);
 
-    d = init_genrand( gRandomSeed );
-    generate_random_data( inVecType, numElements * vecSize,
-                         d, inBuffer );
-    free_mtdata(d); d = NULL;
+    generate_random_data(inVecType, numElements * vecSize, d,
+                         &inBuffer.front());
 
     // Create I/O streams and set arguments
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, inBufferSize, inBuffer, &error );
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
+                                &inBuffer.front(), &error);
     test_error( error, "Unable to create I/O stream" );
     streams[ 1 ] = clCreateBuffer( context, CL_MEM_READ_WRITE, outBufferSize, NULL, &error );
     test_error( error, "Unable to create I/O stream" );
@@ -161,15 +130,15 @@
     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
     test_error( error, "Unable to run kernel" );
 
-
     // Get the results and compare
     // The beauty is that astype is supposed to return the bit pattern as a different type, which means
     // the output should have the exact same bit pattern as the input. No interpretation necessary!
-    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, outBufferSize, outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
+                                &outBuffer.front(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
-    char *expected = inBuffer;
-    char *actual = outBuffer;
+    char *expected = &inBuffer.front();
+    char *actual = &outBuffer.front();
     size_t compSize = typeSize*vecSize;
     if(outTypeSize*outVecSize < compSize) {
         compSize = outTypeSize*outVecSize;
@@ -178,8 +147,6 @@
     if(outVecSize == 4 && vecSize == 3)
     {
         // as_type4(vec3) should compile but produce undefined results??
-        free(inBuffer);
-        free(outBuffer);
         return 0;
     }
 
@@ -188,8 +155,6 @@
         // as_typen(vecm) should compile and run but produce
         // implementation-defined results for m != n
         // and n*sizeof(type) = sizeof(vecm)
-        free(inBuffer);
-        free(outBuffer);
         return 0;
     }
 
@@ -203,17 +168,14 @@
                       GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                       GetDataVectorString( actual, typeSize, vecSize, actualString ) );
             log_error("Src is :\n%s\n----\n%d threads %d localthreads\n",
-                      programSrc, (int)threads[0],(int) localThreads[0]);
-            free(inBuffer);
-            free(outBuffer);
+                      kernelSource.c_str(), (int)threads[0],
+                      (int)localThreads[0]);
             return 1;
         }
         expected += typeSize * vecSize;
         actual += outTypeSize * outVecSize;
     }
 
-    free(inBuffer);
-    free(outBuffer);
     return 0;
 }
 
@@ -223,31 +185,39 @@
     // legal in OpenCL 1.0, the result is dependent on the device it runs on, which means there's no actual way
     // for us to verify what is "valid". So the only thing we can test are types that match in size independent
     // of the element count (char -> uchar, etc)
-    ExplicitType vecTypes[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    const std::vector<ExplicitType> vecTypes = { kChar,   kUChar, kShort,
+                                                 kUShort, kInt,   kUInt,
+                                                 kLong,   kULong, kFloat,
+                                                 kHalf,   kDouble };
+    const unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
     unsigned int inTypeIdx, outTypeIdx, sizeIdx, outSizeIdx;
     size_t inTypeSize, outTypeSize;
     int error = 0;
 
-    for( inTypeIdx = 0; vecTypes[ inTypeIdx ] != kNumExplicitTypes; inTypeIdx++ )
+    bool fp16Support = is_extension_available(device, "cl_khr_fp16");
+    bool fp64Support = is_extension_available(device, "cl_khr_fp64");
+
+    auto skip_type = [&](ExplicitType et) {
+        if ((et == kLong || et == kULong) && !gHasLong)
+            return true;
+        else if (et == kDouble && !fp64Support)
+            return true;
+        else if (et == kHalf && !fp16Support)
+            return true;
+        return false;
+    };
+
+    for (inTypeIdx = 0; inTypeIdx < vecTypes.size(); inTypeIdx++)
     {
         inTypeSize = get_explicit_type_size(vecTypes[inTypeIdx]);
 
-        if( vecTypes[ inTypeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
-            continue;
+        if (skip_type(vecTypes[inTypeIdx])) continue;
 
-        if (( vecTypes[ inTypeIdx ] == kLong || vecTypes[ inTypeIdx ] == kULong ) && !gHasLong )
-            continue;
-
-        for( outTypeIdx = 0; vecTypes[ outTypeIdx ] != kNumExplicitTypes; outTypeIdx++ )
+        for (outTypeIdx = 0; outTypeIdx < vecTypes.size(); outTypeIdx++)
         {
             outTypeSize = get_explicit_type_size(vecTypes[outTypeIdx]);
-            if( vecTypes[ outTypeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) ) {
-                continue;
-            }
 
-            if (( vecTypes[ outTypeIdx ] == kLong || vecTypes[ outTypeIdx ] == kULong ) && !gHasLong )
-                continue;
+            if (skip_type(vecTypes[outTypeIdx])) continue;
 
             // change this check
             if( inTypeIdx == outTypeIdx ) {
@@ -259,7 +229,6 @@
 
             for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
             {
-
                 for(outSizeIdx = 0; vecSizes[outSizeIdx] != 0; outSizeIdx++)
                 {
                     if(vecSizes[sizeIdx]*inTypeSize !=
@@ -268,10 +237,7 @@
                         continue;
                     }
                     error += test_astype_set( device, context, queue, vecTypes[ inTypeIdx ], vecTypes[ outTypeIdx ], vecSizes[ sizeIdx ], vecSizes[outSizeIdx], n_elems );
-
-
                 }
-
             }
             if(get_explicit_type_size(vecTypes[inTypeIdx]) ==
                get_explicit_type_size(vecTypes[outTypeIdx])) {
diff --git a/test_conformance/basic/test_async_copy.cpp b/test_conformance/basic/test_async_copy.cpp
index a537c8f..bb529bc 100644
--- a/test_conformance/basic/test_async_copy.cpp
+++ b/test_conformance/basic/test_async_copy.cpp
@@ -20,8 +20,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
@@ -86,8 +85,7 @@
     clKernelWrapper kernel;
     clMemWrapper streams[ 2 ];
     size_t threads[ 1 ], localThreads[ 1 ];
-    void *inBuffer, *outBuffer;
-    MTdata d;
+    MTdataHolder d(gRandomSeed);
     char vecNameString[64]; vecNameString[0] = 0;
     if (vecSize == 1)
         sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
@@ -109,9 +107,15 @@
     char programSource[4096]; programSource[0]=0;
     char *programPtr;
 
-    sprintf(programSource, kernelCode,
-            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-            vecNameString, vecNameString, vecNameString, vecNameString, get_explicit_type_name(vecType), vecNameString, vecNameString);
+    std::string extStr = "";
+    if (vecType == kDouble)
+        extStr = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable";
+    else if (vecType == kHalf)
+        extStr = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable";
+
+    sprintf(programSource, kernelCode, extStr.c_str(), vecNameString,
+            vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
     //log_info("program: %s\n", programSource);
     programPtr = programSource;
 
@@ -150,9 +154,10 @@
     size_t globalBufferSize = numberOfLocalWorkgroups*localBufferSize;
     size_t globalWorkgroupSize = numberOfLocalWorkgroups*localWorkgroupSize;
 
-    inBuffer = (void*)malloc(globalBufferSize);
-    outBuffer = (void*)malloc(globalBufferSize);
-    memset(outBuffer, 0, globalBufferSize);
+    std::vector<unsigned char> inBuffer(globalBufferSize);
+    std::vector<unsigned char> outBuffer(globalBufferSize);
+
+    outBuffer.assign(globalBufferSize, 0);
 
     cl_int copiesPerWorkItemInt, copiesPerWorkgroup;
     copiesPerWorkItemInt = (int)numberOfCopiesPerWorkitem;
@@ -164,13 +169,15 @@
     threads[0] = globalWorkgroupSize;
     localThreads[0] = localWorkgroupSize;
 
-    d = init_genrand( gRandomSeed );
-    generate_random_data( vecType, globalBufferSize/get_explicit_type_size(vecType), d, inBuffer );
-    free_mtdata(d); d = NULL;
+    generate_random_data(vecType,
+                         globalBufferSize / get_explicit_type_size(vecType), d,
+                         &inBuffer.front());
 
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, inBuffer, &error );
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, globalBufferSize,
+                                &inBuffer.front(), &error);
     test_error( error, "Unable to create input buffer" );
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, outBuffer, &error );
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, globalBufferSize,
+                                &outBuffer.front(), &error);
     test_error( error, "Unable to create output buffer" );
 
     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
@@ -189,16 +196,18 @@
     test_error( error, "Unable to queue kernel" );
 
     // Read
-    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, globalBufferSize, outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, globalBufferSize,
+                                &outBuffer.front(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
     // Verify
     int failuresPrinted = 0;
-    if( memcmp( inBuffer, outBuffer, globalBufferSize ) != 0 )
+    if (memcmp(&inBuffer.front(), &outBuffer.front(), globalBufferSize) != 0)
     {
         size_t typeSize = get_explicit_type_size(vecType)* vecSize;
-        unsigned char * inchar = (unsigned char*)inBuffer;
-        unsigned char * outchar = (unsigned char*)outBuffer;
+        unsigned char *inchar = static_cast<unsigned char *>(&inBuffer.front());
+        unsigned char *outchar =
+            static_cast<unsigned char *>(&outBuffer.front());
         for (int i=0; i< (int)globalBufferSize; i+=(int)elementSize) {
             if (memcmp( ((char *)inchar)+i, ((char *)outchar)+i, typeSize) != 0 )
             {
@@ -226,26 +235,29 @@
         }
     }
 
-    free(inBuffer);
-    free(outBuffer);
-
     return failuresPrinted ? -1 : 0;
 }
 
 int test_copy_all_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode) {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    const std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                                kInt,   kUInt,  kLong,  kULong,
+                                                kFloat, kHalf,  kDouble };
     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
     unsigned int size, typeIndex;
 
     int errors = 0;
 
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
-    {
-        if( vecType[ typeIndex ] == kDouble && !is_extension_available( deviceID, "cl_khr_fp64" ) )
-            continue;
+    bool fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+    bool fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
 
+    for (typeIndex = 0; typeIndex < vecType.size(); typeIndex++)
+    {
         if (( vecType[ typeIndex ] == kLong || vecType[ typeIndex ] == kULong ) && !gHasLong )
             continue;
+        else if (vecType[typeIndex] == kDouble && !fp64Support)
+            continue;
+        else if (vecType[typeIndex] == kHalf && !fp16Support)
+            continue;
 
         for( size = 0; vecSizes[ size ] != 0; size++ )
         {
@@ -259,9 +271,6 @@
     return 0;
 }
 
-
-
-
 int test_async_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return test_copy_all_types( deviceID, context, queue, async_global_to_local_kernel );
diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
index bf3f155..11ef84b 100644
--- a/test_conformance/basic/test_async_copy2D.cpp
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -27,17 +27,25 @@
 
 static const char *async_global_to_local_kernel2D = R"OpenCLC(
 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
-%s // optional pragma string
 
-__kernel void test_fn(const __global %s *src, __global %s *dst,
-                      __local %s *localBuffer, int numElementsPerLine,
+#define STRUCT_SIZE %d
+typedef struct __attribute__((packed))
+{
+    uchar byte[STRUCT_SIZE];
+} VarSizeStruct __attribute__((aligned(1)));
+
+
+__kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst,
+                      __local VarSizeStruct *localBuffer, int numElementsPerLine,
                       int lineCopiesPerWorkgroup, int lineCopiesPerWorkItem,
                       int srcStride, int dstStride) {
   // Zero the local storage first
   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
     for (int j = 0; j < numElementsPerLine; j++) {
       const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
-      localBuffer[index] = (%s)(%s)0;
+      for (int k = 0; k < STRUCT_SIZE; k++) {
+        localBuffer[index].byte[k] = 0;
+      }
     }
   }
 
@@ -45,7 +53,7 @@
   // try the copy
   barrier( CLK_LOCAL_MEM_FENCE );
   event_t event = async_work_group_copy_2D2D(localBuffer, 0, src,
-    lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(%s),
+    lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(VarSizeStruct),
     (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0);
 
   // Wait for the copy to complete, then verify by manually copying to the dest
@@ -63,16 +71,24 @@
 
 static const char *async_local_to_global_kernel2D = R"OpenCLC(
 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
-%s // optional pragma string
 
-__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+#define STRUCT_SIZE %d
+typedef struct __attribute__((packed))
+{
+    uchar byte[STRUCT_SIZE];
+} VarSizeStruct __attribute__((aligned(1)));
+
+
+__kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
                       int numElementsPerLine, int lineCopiesPerWorkgroup,
                       int lineCopiesPerWorkItem, int srcStride, int dstStride) {
   // Zero the local storage first
   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
     for (int j = 0; j < numElementsPerLine; j++) {
       const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
-      localBuffer[index] = (%s)(%s)0;
+      for (int k = 0; k < STRUCT_SIZE; k++) {
+        localBuffer[index].byte[k] = 0;
+      }
     }
   }
 
@@ -90,36 +106,22 @@
   // Do this to verify all kernels are done copying to the local buffer before we try the copy
   barrier(CLK_LOCAL_MEM_FENCE);
   event_t event = async_work_group_copy_2D2D(dst, lineCopiesPerWorkgroup * get_group_id(0) * dstStride,
-    localBuffer, 0, sizeof(%s), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride,
+    localBuffer, 0, sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride,
    dstStride, 0 );
 
   wait_group_events(1, &event);
 };
 )OpenCLC";
 
-int test_copy2D(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, const char *kernelCode,
-                ExplicitType vecType, int vecSize, int srcMargin, int dstMargin,
-                bool localIsDst)
+int test_copy2D(const cl_device_id deviceID, const cl_context context,
+                const cl_command_queue queue, const char *const kernelCode,
+                const size_t elementSize, const int srcMargin,
+                const int dstMargin, const bool localIsDst)
 {
     int error;
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-    clMemWrapper streams[2];
-    size_t threads[1], localThreads[1];
-    void *inBuffer, *outBuffer, *outBufferCopy;
-    MTdata d;
-    char vecNameString[64];
-    vecNameString[0] = 0;
-    if (vecSize == 1)
-        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
-    else
-        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
-                vecSize);
 
-    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
-    log_info("Testing %s with srcMargin = %d, dstMargin = %d\n", vecNameString,
-             srcMargin, dstMargin);
+    log_info("Testing %zu byte element with srcMargin = %d, dstMargin = %d\n",
+             elementSize, srcMargin, dstMargin);
 
     cl_long max_local_mem_size;
     error =
@@ -139,6 +141,13 @@
     test_error(error,
                "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
 
+    size_t max_work_group_size; // CL_DEVICE_MAX_WORK_GROUP_SIZE is a size_t
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(max_work_group_size), &max_work_group_size,
+                            NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
+
     if (max_alloc_size > max_global_mem_size / 2)
         max_alloc_size = max_global_mem_size / 2;
 
@@ -149,20 +158,17 @@
     test_error(error,
                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
 
-    char programSource[4096];
-    programSource[0] = 0;
-    char *programPtr;
+    char programSource[4096] = { 0 };
+    const char *programPtr = programSource;
 
-    sprintf(programSource, kernelCode,
-            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
-                               : "",
-            vecNameString, vecNameString, vecNameString, vecNameString,
-            get_explicit_type_name(vecType), vecNameString);
+    sprintf(programSource, kernelCode, static_cast<int>(elementSize));
     // log_info("program: %s\n", programSource);
-    programPtr = programSource;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel;
 
     error = create_single_kernel_helper(context, &program, &kernel, 1,
-                                        (const char **)&programPtr, "test_fn");
+                                        &programPtr, "test_fn");
     test_error(error, "Unable to create testing kernel");
 
     size_t max_workgroup_size;
@@ -188,9 +194,6 @@
     const cl_int dstStride = numElementsPerLine + dstMargin;
     const cl_int srcStride = numElementsPerLine + srcMargin;
 
-    elementSize =
-        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
-
     const size_t lineCopiesPerWorkItem = 13;
     const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem
         * elementSize * (localIsDst ? dstStride : srcStride);
@@ -208,7 +211,6 @@
     if (maxLocalWorkgroupSize > max_workgroup_size)
         localWorkgroupSize = max_workgroup_size;
 
-
     const size_t maxTotalLinesIn =
         (max_alloc_size / elementSize + srcMargin) / srcStride;
     const size_t maxTotalLinesOut =
@@ -231,9 +233,17 @@
     const size_t globalWorkgroupSize =
         numberOfLocalWorkgroups * localWorkgroupSize;
 
-    inBuffer = (void *)malloc(inBufferSize);
-    outBuffer = (void *)malloc(outBufferSize);
-    outBufferCopy = (void *)malloc(outBufferSize);
+    if ((localBufferSize / 4) > max_work_group_size)
+    {
+        log_info("Skipping due to resource requirements local:%zub  "
+                 "max_work_group_size:%lld\n",
+                 localBufferSize, (long long)max_work_group_size);
+        return 0; // a skipped configuration is not counted as an error
+    }
+
+    void *const inBuffer = (void *)malloc(inBufferSize);
+    void *const outBuffer = (void *)malloc(outBufferSize);
+    void *const outBufferCopy = (void *)malloc(outBufferSize);
 
     const cl_int lineCopiesPerWorkItemInt =
         static_cast<cl_int>(lineCopiesPerWorkItem);
@@ -250,18 +260,20 @@
         (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
         lineCopiesPerWorkItemInt);
 
+    size_t threads[1], localThreads[1];
+
     threads[0] = globalWorkgroupSize;
     localThreads[0] = localWorkgroupSize;
 
-    d = init_genrand(gRandomSeed);
-    generate_random_data(
-        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
-    generate_random_data(
-        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    MTdata d = init_genrand(gRandomSeed);
+    generate_random_data(kChar, inBufferSize, d, inBuffer);
+    generate_random_data(kChar, outBufferSize, d, outBuffer);
     free_mtdata(d);
     d = NULL;
     memcpy(outBufferCopy, outBuffer, outBufferSize);
 
+    clMemWrapper streams[2];
+
     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
                                 inBuffer, &error);
     test_error(error, "Unable to create input buffer");
@@ -301,8 +313,7 @@
 
     // Verify
     int failuresPrinted = 0;
-    // Verify
-    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+
     for (int i = 0;
          i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
          i += elementSize)
@@ -313,13 +324,12 @@
             int inIdx = i * srcStride + j;
             int outIdx = i * dstStride + j;
             if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
-                       typeSize)
+                       elementSize)
                 != 0)
             {
                 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
                 unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
-                char values[4096];
-                values[0] = 0;
+                char values[4096] = { 0 };
 
                 if (failuresPrinted == 0)
                 {
@@ -382,16 +392,14 @@
                           cl_command_queue queue, const char *kernelCode,
                           bool localIsDst)
 {
-    ExplicitType vecType[] = {
-        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
-        kULong, kFloat, kDouble, kNumExplicitTypes
-    };
+    const unsigned int elemSizes[] = { 1, 2,  3,  4,  5,  6, 7,
+                                       8, 13, 16, 32, 47, 64 };
     // The margins below represent the number of elements between the end of
     // one line and the start of the next. The strides are equivalent to the
     // length of the line plus the chosen margin.
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
-    unsigned int smallTypesMarginSizes[] = { 0, 10, 100 };
-    unsigned int size, typeIndex, srcMargin, dstMargin;
+    // These have to be multipliers, because the margin must be a multiple of
+    // element size.
+    const unsigned int marginMultipliers[] = { 0, 10, 100 };
 
     int errors = 0;
 
@@ -399,55 +407,27 @@
     {
         log_info(
             "Device does not support extended async copies. Skipping test.\n");
-        return 0;
     }
-
-    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    else
     {
-        if (vecType[typeIndex] == kDouble
-            && !is_extension_available(deviceID, "cl_khr_fp64"))
-            continue;
-
-        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
-            && !gHasLong)
-            continue;
-
-        for (size = 0; vecSizes[size] != 0; size++)
+        for (const unsigned int elemSize : elemSizes)
         {
-            if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
-                <= 2) // small type
+            for (const unsigned int srcMarginMultiplier : marginMultipliers)
             {
-                for (srcMargin = 0; srcMargin < sizeof(smallTypesMarginSizes)
-                         / sizeof(smallTypesMarginSizes[0]);
-                     srcMargin++)
+                for (const unsigned int dstMarginMultiplier : marginMultipliers)
                 {
-                    for (dstMargin = 0;
-                         dstMargin < sizeof(smallTypesMarginSizes)
-                             / sizeof(smallTypesMarginSizes[0]);
-                         dstMargin++)
+                    if (test_copy2D(deviceID, context, queue, kernelCode,
+                                    elemSize, srcMarginMultiplier * elemSize,
+                                    dstMarginMultiplier * elemSize, localIsDst))
                     {
-                        if (test_copy2D(deviceID, context, queue, kernelCode,
-                                        vecType[typeIndex], vecSizes[size],
-                                        smallTypesMarginSizes[srcMargin],
-                                        smallTypesMarginSizes[dstMargin],
-                                        localIsDst))
-                        {
-                            errors++;
-                        }
+                        errors++;
                     }
                 }
             }
-            // not a small type, check only zero stride
-            else if (test_copy2D(deviceID, context, queue, kernelCode,
-                                 vecType[typeIndex], vecSizes[size], 0, 0,
-                                 localIsDst))
-            {
-                errors++;
-            }
         }
     }
-    if (errors) return -1;
-    return 0;
+
+    return errors ? -1 : 0;
 }
 
 int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
index 5eb41eb..aa22f3a 100644
--- a/test_conformance/basic/test_async_copy3D.cpp
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -27,9 +27,14 @@
 
 static const char *async_global_to_local_kernel3D = R"OpenCLC(
 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
-%s // optional pragma string
 
-__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+#define STRUCT_SIZE %d
+typedef struct __attribute__((packed))
+{
+    uchar byte[STRUCT_SIZE];
+} VarSizeStruct __attribute__((aligned(1)));
+
+__kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
                       int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
                       int planesCopiesPerWorkItem, int srcLineStride,
                       int dstLineStride, int srcPlaneStride, int dstPlaneStride ) {
@@ -38,7 +43,9 @@
     for (int j = 0; j < numLines; j++) {
       for (int k = 0; k < numElementsPerLine; k++) {
         const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
-        localBuffer[index] = (%s)(%s)0;
+        for (int k = 0; k < STRUCT_SIZE; k++) {
+          localBuffer[index].byte[k] = 0;
+        }
       }
     }
   }
@@ -48,7 +55,7 @@
 
   event_t event = async_work_group_copy_3D3D(localBuffer, 0, src,
     planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride,
-    sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines,
+    sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines,
     planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride,
     dstPlaneStride, 0);
 
@@ -69,9 +76,14 @@
 
 static const char *async_local_to_global_kernel3D = R"OpenCLC(
 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
-%s // optional pragma string
 
-__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+#define STRUCT_SIZE %d
+typedef struct __attribute__((packed))
+{
+    uchar byte[STRUCT_SIZE];
+} VarSizeStruct __attribute__((aligned(1)));
+
+__kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
                       int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
                       int planesCopiesPerWorkItem, int srcLineStride,
                       int dstLineStride, int srcPlaneStride, int dstPlaneStride) {
@@ -80,7 +92,9 @@
     for (int j = 0; j < numLines; j++) {
       for (int k = 0; k < numElementsPerLine; k++) {
         const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
-        localBuffer[index] = (%s)(%s)0;
+        for (int k = 0; k < STRUCT_SIZE; k++) {
+          localBuffer[index].byte[k] = 0;
+        }
       }
     }
   }
@@ -103,39 +117,26 @@
 
   event_t event = async_work_group_copy_3D3D(dst,
     planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0,
-    sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
+    sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
     srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0);
 
   wait_group_events(1, &event);
 }
 )OpenCLC";
 
-int test_copy3D(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, const char *kernelCode,
-                ExplicitType vecType, int vecSize, int srcLineMargin,
-                int dstLineMargin, int srcPlaneMargin, int dstPlaneMargin,
-                bool localIsDst)
+int test_copy3D(const cl_device_id deviceID, const cl_context context,
+                const cl_command_queue queue, const char *const kernelCode,
+                const size_t elementSize, const int srcLineMargin,
+                const int dstLineMargin, const int srcPlaneMargin,
+                const int dstPlaneMargin, const bool localIsDst)
 {
     int error;
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-    clMemWrapper streams[2];
-    size_t threads[1], localThreads[1];
-    void *inBuffer, *outBuffer, *outBufferCopy;
-    MTdata d;
-    char vecNameString[64];
-    vecNameString[0] = 0;
-    if (vecSize == 1)
-        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
-    else
-        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
-                vecSize);
 
-    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
-    log_info("Testing %s with srcLineMargin = %d, dstLineMargin = %d, "
-             "srcPlaneMargin = %d, dstPlaneMargin = %d\n",
-             vecNameString, srcLineMargin, dstLineMargin, srcPlaneMargin,
-             dstPlaneMargin);
+    log_info(
+        "Testing %zu byte element with srcLineMargin = %d, dstLineMargin = "
+        "%d, srcPlaneMargin = %d, dstPlaneMargin = %d\n",
+        elementSize, srcLineMargin, dstLineMargin, srcPlaneMargin,
+        dstPlaneMargin);
 
     cl_long max_local_mem_size;
     error =
@@ -165,20 +166,16 @@
     test_error(error,
                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
 
-    char programSource[4096];
-    programSource[0] = 0;
-    char *programPtr;
+    char programSource[4096] = { 0 };
+    const char *programPtr = programSource;
 
-    sprintf(programSource, kernelCode,
-            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
-                               : "",
-            vecNameString, vecNameString, vecNameString, vecNameString,
-            get_explicit_type_name(vecType), vecNameString, vecNameString);
+    sprintf(programSource, kernelCode, static_cast<int>(elementSize));
     // log_info("program: %s\n", programSource);
-    programPtr = programSource;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
 
     error = create_single_kernel_helper(context, &program, &kernel, 1,
-                                        (const char **)&programPtr, "test_fn");
+                                        &programPtr, "test_fn");
     test_error(error, "Unable to create testing kernel");
 
     size_t max_workgroup_size;
@@ -196,6 +193,13 @@
     test_error(error,
                "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
 
+    size_t max_work_group_size = 0; // this query returns a size_t
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(max_work_group_size), &max_work_group_size,
+                            NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
+
     // Pick the minimum of the device and the kernel
     if (max_workgroup_size > max_local_workgroup_size[0])
         max_workgroup_size = max_local_workgroup_size[0];
@@ -208,8 +212,6 @@
     const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin;
     const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin;
 
-    elementSize =
-        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
     const size_t planesCopiesPerWorkItem = 2;
     const size_t localStorageSpacePerWorkitem = elementSize
         * planesCopiesPerWorkItem
@@ -251,9 +253,17 @@
     const size_t globalWorkgroupSize =
         numberOfLocalWorkgroups * localWorkgroupSize;
 
-    inBuffer = (void *)malloc(inBufferSize);
-    outBuffer = (void *)malloc(outBufferSize);
-    outBufferCopy = (void *)malloc(outBufferSize);
+    if ((localBufferSize / 4) > max_work_group_size)
+    {
+        log_info("Skipping due to resource requirements local:%zub  "
+                 "max_work_group_size:%lld\n",
+                 localBufferSize, (long long)max_work_group_size);
+        return 0; // a skipped configuration is not counted as an error
+    }
+
+    void *const inBuffer = (void *)malloc(inBufferSize);
+    void *const outBuffer = (void *)malloc(outBufferSize);
+    void *const outBufferCopy = (void *)malloc(outBufferSize);
 
     const cl_int planesCopiesPerWorkItemInt =
         static_cast<cl_int>(planesCopiesPerWorkItem);
@@ -270,18 +280,20 @@
              (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
              planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
 
+    size_t threads[1], localThreads[1];
+
     threads[0] = globalWorkgroupSize;
     localThreads[0] = localWorkgroupSize;
 
-    d = init_genrand(gRandomSeed);
-    generate_random_data(
-        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
-    generate_random_data(
-        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    MTdata d = init_genrand(gRandomSeed);
+    generate_random_data(kChar, inBufferSize, d, inBuffer);
+    generate_random_data(kChar, outBufferSize, d, outBuffer);
     free_mtdata(d);
     d = NULL;
     memcpy(outBufferCopy, outBuffer, outBufferSize);
 
+    clMemWrapper streams[2];
+
     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
                                 inBuffer, &error);
     test_error(error, "Unable to create input buffer");
@@ -327,8 +339,7 @@
 
     // Verify
     int failuresPrinted = 0;
-    // Verify
-    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+
     for (int i = 0;
          i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
          i += elementSize)
@@ -341,14 +352,13 @@
                 int inIdx = i * srcPlaneStride + j * srcLineStride + k;
                 int outIdx = i * dstPlaneStride + j * dstLineStride + k;
                 if (memcmp(((char *)inBuffer) + inIdx,
-                           ((char *)outBuffer) + outIdx, typeSize)
+                           ((char *)outBuffer) + outIdx, elementSize)
                     != 0)
                 {
                     unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
                     unsigned char *outchar =
                         (unsigned char *)outBuffer + outIdx;
-                    char values[4096];
-                    values[0] = 0;
+                    char values[4096] = { 0 };
 
                     if (failuresPrinted == 0)
                     {
@@ -439,17 +449,14 @@
                           cl_command_queue queue, const char *kernelCode,
                           bool localIsDst)
 {
-    ExplicitType vecType[] = {
-        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
-        kULong, kFloat, kDouble, kNumExplicitTypes
-    };
+    const unsigned int elemSizes[] = { 1, 2,  3,  4,  5,  6, 7,
+                                       8, 13, 16, 32, 47, 64 };
     // The margins below represent the number of elements between the end of
-    // one line or plane and the start of the next. The strides are equivalent
-    // to the size of the line or plane plus the chosen margin.
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
-    unsigned int smallTypesMarginSizes[] = { 0, 10, 100 };
-    unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin,
-        dstPlaneMargin;
+    // one line or plane and the start of the next. The strides are equivalent
+    // to the size of the line or plane plus the chosen margin.
+    // These have to be multipliers, because the margin must be a multiple of
+    // element size.
+    const unsigned int marginMultipliers[] = { 0, 10, 100 };
 
     int errors = 0;
 
@@ -457,67 +464,36 @@
     {
         log_info(
             "Device does not support extended async copies. Skipping test.\n");
-        return 0;
     }
-
-    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    else
     {
-        if (vecType[typeIndex] == kDouble
-            && !is_extension_available(deviceID, "cl_khr_fp64"))
-            continue;
-
-        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
-            && !gHasLong)
-            continue;
-
-        for (size = 0; vecSizes[size] != 0; size++)
+        for (const unsigned int elemSize : elemSizes)
         {
-            if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
-                <= 2) // small type
+            for (const unsigned int srcLineMarginMultiplier : marginMultipliers)
             {
-                for (srcLineMargin = 0;
-                     srcLineMargin < sizeof(smallTypesMarginSizes)
-                         / sizeof(smallTypesMarginSizes[0]);
-                     srcLineMargin++)
+                for (const unsigned int dstLineMarginMultiplier :
+                     marginMultipliers)
                 {
-                    for (dstLineMargin = 0;
-                         dstLineMargin < sizeof(smallTypesMarginSizes)
-                             / sizeof(smallTypesMarginSizes[0]);
-                         dstLineMargin++)
+                    for (const unsigned int srcPlaneMarginMultiplier :
+                         marginMultipliers)
                     {
-                        for (srcPlaneMargin = 0;
-                             srcPlaneMargin < sizeof(smallTypesMarginSizes)
-                                 / sizeof(smallTypesMarginSizes[0]);
-                             srcPlaneMargin++)
+                        for (const unsigned int dstPlaneMarginMultiplier :
+                             marginMultipliers)
                         {
-                            for (dstPlaneMargin = 0;
-                                 dstPlaneMargin < sizeof(smallTypesMarginSizes)
-                                     / sizeof(smallTypesMarginSizes[0]);
-                                 dstPlaneMargin++)
+                            if (test_copy3D(deviceID, context, queue,
+                                            kernelCode, elemSize,
+                                            srcLineMarginMultiplier * elemSize,
+                                            dstLineMarginMultiplier * elemSize,
+                                            srcPlaneMarginMultiplier * elemSize,
+                                            dstPlaneMarginMultiplier * elemSize,
+                                            localIsDst))
                             {
-                                if (test_copy3D(
-                                        deviceID, context, queue, kernelCode,
-                                        vecType[typeIndex], vecSizes[size],
-                                        smallTypesMarginSizes[srcLineMargin],
-                                        smallTypesMarginSizes[dstLineMargin],
-                                        smallTypesMarginSizes[srcPlaneMargin],
-                                        smallTypesMarginSizes[dstPlaneMargin],
-                                        localIsDst))
-                                {
-                                    errors++;
-                                }
+                                errors++;
                             }
                         }
                     }
                 }
             }
-            // not a small type, check only zero stride
-            else if (test_copy3D(deviceID, context, queue, kernelCode,
-                                 vecType[typeIndex], vecSizes[size], 0, 0, 0, 0,
-                                 localIsDst))
-            {
-                errors++;
-            }
         }
     }
     if (errors) return -1;
diff --git a/test_conformance/basic/test_async_strided_copy.cpp b/test_conformance/basic/test_async_strided_copy.cpp
index c456f38..932e9b8 100644
--- a/test_conformance/basic/test_async_strided_copy.cpp
+++ b/test_conformance/basic/test_async_strided_copy.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -20,15 +20,16 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
 
+// clang-format off
+
 static const char *async_strided_global_to_local_kernel =
 "%s\n" // optional pragma string
-"%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
+"__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
 "{\n"
 " int i;\n"
 // Zero the local storage first
@@ -46,7 +47,7 @@
 
 static const char *async_strided_local_to_global_kernel =
 "%s\n" // optional pragma string
-"%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
+"__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
 "{\n"
 " int i;\n"
 // Zero the local storage first
@@ -63,6 +64,7 @@
 " wait_group_events( 1, &event );\n"
 "}\n" ;
 
+// clang-format on
 
 int test_strided_copy(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, ExplicitType vecType, int vecSize, int stride)
 {
@@ -71,8 +73,7 @@
     clKernelWrapper kernel;
     clMemWrapper streams[ 2 ];
     size_t threads[ 1 ], localThreads[ 1 ];
-    void *inBuffer, *outBuffer;
-    MTdata d;
+    MTdataHolder d(gRandomSeed);
     char vecNameString[64]; vecNameString[0] = 0;
 
     if (vecSize == 1)
@@ -94,10 +95,15 @@
     char programSource[4096]; programSource[0]=0;
     char *programPtr;
 
-    sprintf(programSource, kernelCode,
-        vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-        "",
-        vecNameString, vecNameString, vecNameString, vecNameString, get_explicit_type_name(vecType), vecNameString, vecNameString);
+    std::string extStr = "";
+    if (vecType == kDouble)
+        extStr = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable";
+    else if (vecType == kHalf)
+        extStr = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable";
+
+    sprintf(programSource, kernelCode, extStr.c_str(), vecNameString,
+            vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
     //log_info("program: %s\n", programSource);
     programPtr = programSource;
 
@@ -151,9 +157,9 @@
     size_t globalBufferSize = numberOfLocalWorkgroups*localBufferSize*stride;
     size_t globalWorkgroupSize = numberOfLocalWorkgroups*localWorkgroupSize;
 
-    inBuffer = (void*)malloc(globalBufferSize);
-    outBuffer = (void*)malloc(globalBufferSize);
-    memset(outBuffer, 0, globalBufferSize);
+    std::vector<unsigned char> inBuffer(globalBufferSize);
+    std::vector<unsigned char> outBuffer(globalBufferSize);
+    memset(outBuffer.data(), 0, globalBufferSize);
 
     cl_int copiesPerWorkItemInt, copiesPerWorkgroup;
     copiesPerWorkItemInt = (int)numberOfCopiesPerWorkitem;
@@ -165,13 +171,15 @@
     threads[0] = globalWorkgroupSize;
     localThreads[0] = localWorkgroupSize;
 
-    d = init_genrand( gRandomSeed );
-    generate_random_data( vecType, globalBufferSize/get_explicit_type_size(vecType), d, inBuffer );
-    free_mtdata(d); d = NULL;
+    generate_random_data(vecType,
+                         globalBufferSize / get_explicit_type_size(vecType), d,
+                         inBuffer.data());
 
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, inBuffer, &error );
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, globalBufferSize,
+                                inBuffer.data(), &error);
     test_error( error, "Unable to create input buffer" );
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, outBuffer, &error );
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, globalBufferSize,
+                                outBuffer.data(), &error);
     test_error( error, "Unable to create output buffer" );
 
     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
@@ -192,17 +200,20 @@
     test_error( error, "Unable to queue kernel" );
 
     // Read
-    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, globalBufferSize, outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, globalBufferSize,
+                                outBuffer.data(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
     // Verify
     size_t typeSize = get_explicit_type_size(vecType)* vecSize;
     for (int i=0; i<(int)globalBufferSize; i+=(int)elementSize*(int)stride)
     {
-        if (memcmp( ((char *)inBuffer)+i, ((char *)outBuffer)+i, typeSize) != 0 )
+        if (memcmp(&inBuffer.at(i), &outBuffer.at(i), typeSize) != 0)
         {
-            unsigned char * inchar = (unsigned char*)inBuffer + i;
-            unsigned char * outchar = (unsigned char*)outBuffer + i;
+            unsigned char *inchar =
+                static_cast<unsigned char *>(&inBuffer.at(i));
+            unsigned char *outchar =
+                static_cast<unsigned char *>(&outBuffer.at(i));
             char values[4096];
             values[0] = 0;
 
@@ -215,34 +226,35 @@
                 sprintf(values + strlen( values), "%2x ", outchar[j]);
             sprintf(values + strlen(values), "]");
             log_error("%s\n", values);
-            free(inBuffer);
-            free(outBuffer);
             return -1;
         }
     }
 
-    free(inBuffer);
-    free(outBuffer);
-
     return 0;
 }
 
 int test_strided_copy_all_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
-    unsigned int strideSizes[] = { 1, 3, 4, 5, 0 };
+    const std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                                kInt,   kUInt,  kLong,  kULong,
+                                                kFloat, kHalf,  kDouble };
+    const unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    const unsigned int strideSizes[] = { 1, 3, 4, 5, 0 };
     unsigned int size, typeIndex, stride;
 
     int errors = 0;
 
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
-    {
-        if( vecType[ typeIndex ] == kDouble && !is_extension_available( deviceID, "cl_khr_fp64" ) )
-            continue;
+    bool fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+    bool fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
 
+    for (typeIndex = 0; typeIndex < vecType.size(); typeIndex++)
+    {
         if (( vecType[ typeIndex ] == kLong || vecType[ typeIndex ] == kULong ) && !gHasLong )
             continue;
+        else if (vecType[typeIndex] == kDouble && !fp64Support)
+            continue;
+        else if (vecType[typeIndex] == kHalf && !fp16Support)
+            continue;
 
         for( size = 0; vecSizes[ size ] != 0; size++ )
         {
@@ -260,9 +272,6 @@
     return 0;
 }
 
-
-
-
 int test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return test_strided_copy_all_types( deviceID, context, queue, async_strided_global_to_local_kernel );
diff --git a/test_conformance/basic/test_barrier.cpp b/test_conformance/basic/test_barrier.cpp
index d20af14..6352b42 100644
--- a/test_conformance/basic/test_barrier.cpp
+++ b/test_conformance/basic/test_barrier.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,143 +21,136 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <numeric>
+#include <vector>
 
 #include "procs.h"
 
-const char *barrier_kernel_code =
-"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
-"{\n"
-"    int  tid = get_local_id(0);\n"
-"    int  lsize = get_local_size(0);\n"
-"    int  i;\n"
-"\n"
-"    tmp_sum[tid] = 0;\n"
-"    for (i=tid; i<n; i+=lsize)\n"
-"        tmp_sum[tid] += a[i];\n"
-"     \n"
-"     // updated to work for any workgroup size \n"
-"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
-"    {\n"
-"        barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"        if (tid + i < lsize)\n"
-"            tmp_sum[tid] += tmp_sum[tid + i];\n"
-"         lsize = i; \n"
-"    }\n"
-"\n"
-"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
-"    if (tid == 0)\n"
-"        *sum = tmp_sum[0];\n"
-"}\n";
-
-
-static int
-verify_sum(int *inptr, int *outptr, int n)
+namespace {
+const char *barrier_kernel_code = R"(
+__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum,
+                          __global int *sum)
 {
-  int            r = 0;
-  int         i;
+    int tid = get_local_id(0);
+    int lsize = get_local_size(0);
+    int i;
 
-  for (i=0; i<n; i++)
-  {
-        r += inptr[i];
-  }
+    tmp_sum[tid] = 0;
+    for (i = tid; i < n; i += lsize) tmp_sum[tid] += a[i];
 
-    if (r != outptr[0])
+    // updated to work for any workgroup size
+    for (i = hadd(lsize, 1); lsize > 1; i = hadd(i, 1))
     {
-        log_error("BARRIER test failed\n");
-        return -1;
+        BARRIER(CLK_GLOBAL_MEM_FENCE);
+        if (tid + i < lsize) tmp_sum[tid] += tmp_sum[tid + i];
+        lsize = i;
     }
 
-  log_info("BARRIER test passed\n");
-  return 0;
+    // no barrier is required here because last person to write to tmp_sum[0]
+    // was tid 0
+    if (tid == 0) *sum = tmp_sum[0];
+}
+)";
+
+
+void generate_random_inputs(std::vector<cl_int> &v)
+{
+    RandomSeed seed(gRandomSeed);
+
+    auto random_generator = [&seed]() {
+        return static_cast<cl_int>(
+            get_random_float(-0x01000000, 0x01000000, seed));
+    };
+
+    std::generate(v.begin(), v.end(), random_generator);
 }
 
-
-int
-test_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_barrier_common(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements,
+                        std::string barrier_str)
 {
-    cl_mem            streams[3];
-    cl_int            *input_ptr = NULL, *output_ptr = NULL;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    size_t    local_threads[3];
-    int                err;
-    int                i;
-    size_t max_local_workgroup_size[3];
-    size_t max_threadgroup_size = 0;
-    MTdata d;
+    clMemWrapper streams[3];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
 
-    err = create_single_kernel_helper(context, &program, &kernel, 1, &barrier_kernel_code, "compute_sum" );
+    cl_int output;
+    int err;
+
+    size_t max_threadgroup_size = 0;
+    std::string build_options = std::string("-DBARRIER=") + barrier_str;
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &barrier_kernel_code, "compute_sum",
+                                      build_options.c_str());
     test_error(err, "Failed to build kernel/program.");
 
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                 sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
-    test_error(err, "clGetKernelWorkgroupInfo failed.");
-
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
-    test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
-
-    // Pick the minimum of the device and the kernel
-    if (max_threadgroup_size > max_local_workgroup_size[0])
-        max_threadgroup_size = max_local_workgroup_size[0];
+    err = get_max_allowed_1d_work_group_size_on_device(device, kernel,
+                                                       &max_threadgroup_size);
+    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed.");
 
     // work group size must divide evenly into the global size
-    while( num_elements % max_threadgroup_size )
-        max_threadgroup_size--;
+    while (num_elements % max_threadgroup_size) max_threadgroup_size--;
 
-    input_ptr = (int*)malloc(sizeof(int) * num_elements);
-    output_ptr = (int*)malloc(sizeof(int));
+    std::vector<cl_int> input(num_elements);
 
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, &err);
+                                sizeof(cl_int) * num_elements, nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
-    streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err);
+    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int),
+                                nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
     streams[2] =
         clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * max_threadgroup_size, NULL, &err);
+                       sizeof(cl_int) * max_threadgroup_size, nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
 
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
-    free_mtdata(d);  d = NULL;
+    generate_random_inputs(input);
 
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                               sizeof(cl_int) * num_elements, input.data(), 0,
+                               nullptr, nullptr);
     test_error(err, "clEnqueueWriteBuffer failed.");
 
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
-    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
+    err = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof(num_elements), &num_elements);
+    err |= clSetKernelArg(kernel, 2, sizeof(streams[2]), &streams[2]);
+    err |= clSetKernelArg(kernel, 3, sizeof(streams[1]), &streams[1]);
     test_error(err, "clSetKernelArg failed.");
 
-    global_threads[0] = max_threadgroup_size;
-    local_threads[0] = max_threadgroup_size;
+    size_t global_threads[] = { max_threadgroup_size };
+    size_t local_threads[] = { max_threadgroup_size };
 
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_threads,
+                                 local_threads, 0, nullptr, nullptr);
     test_error(err, "clEnqueueNDRangeKernel failed.");
 
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
+    err = clEnqueueReadBuffer(queue, streams[1], true, 0, sizeof(cl_int),
+                              &output, 0, nullptr, nullptr);
     test_error(err, "clEnqueueReadBuffer failed.");
 
-        err = verify_sum(input_ptr, output_ptr, num_elements);
-
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(output_ptr);
+    if (std::accumulate(input.begin(), input.end(), 0) != output)
+    {
+        log_error("%s test failed\n", barrier_str.c_str());
+        err = -1;
+    }
+    else
+    {
+        log_info("%s test passed\n", barrier_str.c_str());
+    }
 
     return err;
 }
+}
 
+int test_barrier(cl_device_id device, cl_context context,
+                 cl_command_queue queue, int num_elements)
+{
+    return test_barrier_common(device, context, queue, num_elements, "barrier");
+}
 
-
-
-
+int test_wg_barrier(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int num_elements)
+{
+    return test_barrier_common(device, context, queue, num_elements,
+                               "work_group_barrier");
+}
diff --git a/test_conformance/basic/test_constant.cpp b/test_conformance/basic/test_constant.cpp
index ed25c6e..fc2667e 100644
--- a/test_conformance/basic/test_constant.cpp
+++ b/test_conformance/basic/test_constant.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,41 +21,44 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <vector>
 
 #include "procs.h"
 
-const char *constant_kernel_code =
-"__kernel void constant_kernel(__global float *out, __constant float *tmpF, __constant int *tmpI)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"\n"
-"    float ftmp = tmpF[tid]; \n"
-"    float Itmp = tmpI[tid]; \n"
-"    out[tid] = ftmp * Itmp; \n"
-"}\n";
-
-const char *loop_constant_kernel_code =
-"kernel void loop_constant_kernel(global float *out, constant float *i_pos, int num)\n"
-"{\n"
-"    int tid = get_global_id(0);\n"
-"    float sum = 0;\n"
-"    for (int i = 0; i < num; i++) {\n"
-"        float  pos  = i_pos[i*3];\n"
-"        sum += pos;\n"
-"    }\n"
-"    out[tid] = sum;\n"
-"}\n";
-
-
-static int
-verify(cl_float *tmpF, cl_int *tmpI, cl_float *out, int n)
+namespace {
+const char* constant_kernel_code = R"(
+__kernel void constant_kernel(__global float *out, __constant float *tmpF, __constant int *tmpI)
 {
-    int         i;
+    int  tid = get_global_id(0);
 
-    for (i=0; i < n; i++)
+    float ftmp = tmpF[tid];
+    float Itmp = tmpI[tid];
+    out[tid] = ftmp * Itmp;
+}
+)";
+
+const char* loop_constant_kernel_code = R"(
+kernel void loop_constant_kernel(global float *out, constant float *i_pos, int num)
+{
+    int tid = get_global_id(0);
+    float sum = 0;
+    for (int i = 0; i < num; i++) {
+        float  pos  = i_pos[i*3];
+        sum += pos;
+    }
+    out[tid] = sum;
+}
+)";
+
+
+int verify(std::vector<cl_float>& tmpF, std::vector<cl_int>& tmpI,
+           std::vector<cl_float>& out)
+{
+    for (int i = 0; i < out.size(); i++)
     {
         float f = tmpF[i] * tmpI[i];
-        if( out[i] != f )
+        if (out[i] != f)
         {
             log_error("CONSTANT test failed\n");
             return -1;
@@ -66,214 +69,172 @@
     return 0;
 }
 
-
-static int
-verify_loop_constant(const cl_float *tmp, cl_float *out, cl_int l, int n)
+int verify_loop_constant(const std::vector<cl_float>& tmp,
+                         std::vector<cl_float>& out, cl_int l)
 {
-    int i;
-    cl_int j;
-    for (i=0; i < n; i++)
-    {
-        float sum = 0;
-        for (j=0; j < l; ++j)
-            sum += tmp[j*3];
+    float sum = 0;
+    for (int j = 0; j < l; ++j) sum += tmp[j * 3];
 
-        if( out[i] != sum )
-        {
-            log_error("loop CONSTANT test failed\n");
-            return -1;
-        }
+    auto predicate = [&sum](cl_float elem) { return sum != elem; };
+
+    if (std::any_of(out.cbegin(), out.cend(), predicate))
+    {
+        log_error("loop CONSTANT test failed\n");
+        return -1;
     }
 
     log_info("loop CONSTANT test passed\n");
     return 0;
 }
 
-int
-test_constant(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+template <typename T> void generate_random_inputs(std::vector<T>& v)
 {
-    cl_mem            streams[3];
-    cl_int            *tmpI;
-    cl_float        *tmpF, *out;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    int                err;
-    unsigned int                i;
+    RandomSeed seed(gRandomSeed);
+
+    auto random_generator = [&seed]() {
+        return static_cast<T>(get_random_float(-0x02000000, 0x02000000, seed));
+    };
+
+    std::generate(v.begin(), v.end(), random_generator);
+}
+}
+
+int test_constant(cl_device_id device, cl_context context,
+                  cl_command_queue queue, int num_elements)
+{
+    clMemWrapper streams[3];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    size_t global_threads[3];
+    int err;
     cl_ulong maxSize, maxGlobalSize, maxAllocSize;
     size_t num_floats, num_ints, constant_values;
-    MTdata          d;
-    RoundingMode     oldRoundMode;
+    RoundingMode oldRoundMode;
     int isRTZ = 0;
 
-  /* Verify our test buffer won't be bigger than allowed */
-    err = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( err, "Unable to get max constant buffer size" );
+    /* Verify our test buffer won't be bigger than allowed */
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                          sizeof(maxSize), &maxSize, 0);
+    test_error(err, "Unable to get max constant buffer size");
+    log_info("Device reports CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE %llu bytes.\n",
+             maxSize);
 
-  log_info("Device reports CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE %llu bytes.\n", maxSize);
-  
-  // Limit test buffer size to 1/4 of CL_DEVICE_GLOBAL_MEM_SIZE
-  err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalSize), &maxGlobalSize, 0);
-  test_error(err, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
+    // Limit test buffer size to 1/4 of CL_DEVICE_GLOBAL_MEM_SIZE
+    err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
+                          sizeof(maxGlobalSize), &maxGlobalSize, 0);
+    test_error(err, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
 
-  if (maxSize > maxGlobalSize / 4)
-    maxSize = maxGlobalSize / 4;
+    maxSize = std::min(maxSize, maxGlobalSize / 4);
 
-  err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(maxAllocSize), &maxAllocSize, 0);
-  test_error(err, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE ");
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                          sizeof(maxAllocSize), &maxAllocSize, 0);
+    test_error(err, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE");
 
-  if (maxSize > maxAllocSize)
-    maxSize = maxAllocSize;
-  
-  maxSize/=4;
-  num_ints = (size_t)maxSize/sizeof(cl_int);
-  num_floats = (size_t)maxSize/sizeof(cl_float);
-  if (num_ints >= num_floats) {
-    constant_values = num_floats;
-  } else {
-    constant_values = num_ints;
-  }
+    maxSize = std::min(maxSize, maxAllocSize);
 
-  log_info("Test will attempt to use %lu bytes with one %lu byte constant int buffer and one %lu byte constant float buffer.\n",
-           constant_values*sizeof(cl_int) + constant_values*sizeof(cl_float), constant_values*sizeof(cl_int), constant_values*sizeof(cl_float));
+    maxSize /= 4;
+    num_ints = static_cast<size_t>(maxSize / sizeof(cl_int));
+    num_floats = static_cast<size_t>(maxSize / sizeof(cl_float));
+    constant_values = std::min(num_floats, num_ints);
 
-    tmpI = (cl_int*)malloc(sizeof(cl_int) * constant_values);
-    tmpF = (cl_float*)malloc(sizeof(cl_float) * constant_values);
-    out  = (cl_float*)malloc(sizeof(cl_float) * constant_values);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_float) * constant_values, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_float) * constant_values, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * constant_values, NULL, NULL);
-    if (!streams[2])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
 
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<constant_values; i++) {
-        tmpI[i] = (int)get_random_float(-0x02000000, 0x02000000, d);
-        tmpF[i] = get_random_float(-0x02000000, 0x02000000, d);
-    }
-    free_mtdata(d); d = NULL;
+    log_info(
+        "Test will attempt to use %lu bytes with one %lu byte constant int "
+        "buffer and one %lu byte constant float buffer.\n",
+        constant_values * sizeof(cl_int) + constant_values * sizeof(cl_float),
+        constant_values * sizeof(cl_int), constant_values * sizeof(cl_float));
 
-    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)tmpF, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-  err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, sizeof(cl_int)*constant_values, (void *)tmpI, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
+    std::vector<cl_int> tmpI(constant_values);
+    std::vector<cl_float> tmpF(constant_values);
+    std::vector<cl_float> out(constant_values);
 
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &constant_kernel_code, "constant_kernel" );
-    if (err) {
-    log_error("Failed to create kernel and program: %d\n", err);
-    return -1;
-  }
+
+    streams[0] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_float) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_float) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    streams[2] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_int) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    generate_random_inputs(tmpI);
+    generate_random_inputs(tmpF);
+
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0,
+                               sizeof(cl_float) * constant_values, tmpF.data(),
+                               0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed");
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0,
+                               sizeof(cl_int) * constant_values, tmpI.data(), 0,
+                               nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer faile.");
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &constant_kernel_code, "constant_kernel");
+    test_error(err, "Failed to create kernel and program");
 
 
     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
     err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
+    test_error(err, "clSetKernelArgs failed");
 
     global_threads[0] = constant_values;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
-        return -1;
-    }
-    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_threads,
+                                 nullptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed");
 
-    //If we only support rtz mode
-    if( CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device) && gIsEmbedded)
+    err = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                              sizeof(cl_float) * constant_values, out.data(), 0,
+                              nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    // If we only support rtz mode
+    if (CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device) && gIsEmbedded)
     {
         oldRoundMode = set_round(kRoundTowardZero, kfloat);
         isRTZ = 1;
     }
 
-    err = verify(tmpF, tmpI, out, (int)constant_values);
+    err = verify(tmpF, tmpI, out);
 
-    if (isRTZ)
-        (void)set_round(oldRoundMode, kfloat);
+    if (isRTZ) (void)set_round(oldRoundMode, kfloat);
 
     // Loop constant buffer test
-    cl_program loop_program;
-    cl_kernel  loop_kernel;
+    clProgramWrapper loop_program;
+    clKernelWrapper loop_kernel;
     cl_int limit = 2;
 
-    memset(out, 0, sizeof(cl_float) * constant_values);
+    memset(out.data(), 0, sizeof(cl_float) * constant_values);
     err = create_single_kernel_helper(context, &loop_program, &loop_kernel, 1,
-                                      &loop_constant_kernel_code, "loop_constant_kernel" );
-    if (err) {
-        log_error("Failed to create loop kernel and program: %d\n", err);
-        return -1;
-    }
+                                      &loop_constant_kernel_code,
+                                      "loop_constant_kernel");
+    test_error(err, "Failed to create kernel and program");
 
     err = clSetKernelArg(loop_kernel, 0, sizeof streams[0], &streams[0]);
     err |= clSetKernelArg(loop_kernel, 1, sizeof streams[1], &streams[1]);
     err |= clSetKernelArg(loop_kernel, 2, sizeof(limit), &limit);
-    if (err != CL_SUCCESS) {
-        log_error("clSetKernelArgs for loop kernel failed\n");
-        return -1;
-    }
+    test_error(err, "clSetKernelArgs failed");
 
-    err = clEnqueueNDRangeKernel( queue, loop_kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS) {
-        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
-        return -1;
-    }
-    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
-    if (err != CL_SUCCESS) {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
+    err = clEnqueueNDRangeKernel(queue, loop_kernel, 1, nullptr, global_threads,
+                                 nullptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed");
 
-    err = verify_loop_constant(tmpF, out, limit, (int)constant_values);
+    err = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                              sizeof(cl_float) * constant_values, out.data(), 0,
+                              nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed");
 
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    clReleaseKernel(loop_kernel);
-    clReleaseProgram(loop_program);
-    free(tmpI);
-    free(tmpF);
-    free(out);
+    err = verify_loop_constant(tmpF, out, limit);
+
 
     return err;
 }
-
-
-
-
-
diff --git a/test_conformance/basic/test_enqueue_map.cpp b/test_conformance/basic/test_enqueue_map.cpp
index d28f7e4..c2ea24e 100644
--- a/test_conformance/basic/test_enqueue_map.cpp
+++ b/test_conformance/basic/test_enqueue_map.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -26,6 +26,7 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
+// clang-format off
 const cl_mem_flags flag_set[] = {
   CL_MEM_ALLOC_HOST_PTR,
   CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
@@ -33,93 +34,105 @@
   CL_MEM_COPY_HOST_PTR,
   0
 };
-const char* flag_set_names[] = {
+
+const char *flag_set_names[] = {
   "CL_MEM_ALLOC_HOST_PTR",
   "CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR",
   "CL_MEM_USE_HOST_PTR",
   "CL_MEM_COPY_HOST_PTR",
   "0"
 };
+// clang-format on
 
-int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
 {
     int error;
-    const size_t bufferSize = 256*256;
-    MTdataHolder d{gRandomSeed};
+    const size_t bufferSize = 256 * 256;
+    MTdataHolder d{ gRandomSeed };
     BufferOwningPtr<cl_char> hostPtrData{ malloc(bufferSize) };
     BufferOwningPtr<cl_char> referenceData{ malloc(bufferSize) };
-    BufferOwningPtr<cl_char> finalData{malloc(bufferSize)};
+    BufferOwningPtr<cl_char> finalData{ malloc(bufferSize) };
 
-    for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
+    for (size_t src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set);
+         src_flag_id++)
     {
         clMemWrapper memObject;
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
 
         generate_random_data(kChar, (unsigned int)bufferSize, d, hostPtrData);
         memcpy(referenceData, hostPtrData, bufferSize);
 
         void *hostPtr = nullptr;
         cl_mem_flags flags = flag_set[src_flag_id];
-        bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
         if (hasHostPtr) hostPtr = hostPtrData;
-        memObject = clCreateBuffer(context, flags,  bufferSize, hostPtr, &error);
-        test_error( error, "Unable to create testing buffer" );
+        memObject = clCreateBuffer(context, flags, bufferSize, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
         if (!hasHostPtr)
         {
             error =
-            clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
-                                 hostPtrData, 0, NULL, NULL);
-            test_error( error, "clEnqueueWriteBuffer failed");
+                clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                     hostPtrData, 0, NULL, NULL);
+            test_error(error, "clEnqueueWriteBuffer failed");
         }
 
-        for( int i = 0; i < 128; i++ )
+        for (int i = 0; i < 128; i++)
         {
 
-          size_t offset = (size_t)random_in_range( 0, (int)bufferSize - 1, d );
-          size_t length = (size_t)random_in_range( 1, (int)( bufferSize - offset ), d );
+            size_t offset = (size_t)random_in_range(0, (int)bufferSize - 1, d);
+            size_t length =
+                (size_t)random_in_range(1, (int)(bufferSize - offset), d);
 
-          cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                                offset, length, 0, NULL, NULL, &error );
-          if( error != CL_SUCCESS )
-          {
-            print_error( error, "clEnqueueMapBuffer call failed" );
-            log_error( "\tOffset: %d  Length: %d\n", (int)offset, (int)length );
-            return -1;
-          }
+            cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                length, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapBuffer call failed");
+                log_error("\tOffset: %d  Length: %d\n", (int)offset,
+                          (int)length);
+                return -1;
+            }
 
-          // Write into the region
-          for( size_t j = 0; j < length; j++ )
-          {
-            cl_char spin = (cl_char)genrand_int32( d );
+            // Write into the region
+            for (size_t j = 0; j < length; j++)
+            {
+                cl_char spin = (cl_char)genrand_int32(d);
 
-            // Test read AND write in one swipe
-            cl_char value = mappedRegion[ j ];
-            value = spin - value;
-            mappedRegion[ j ] = value;
+                // Test read AND write in one swipe
+                cl_char value = mappedRegion[j];
+                value = spin - value;
+                mappedRegion[j] = value;
 
-            // Also update the initial data array
-            value = referenceData[offset + j];
-            value = spin - value;
-            referenceData[offset + j] = value;
-          }
+                // Also update the initial data array
+                value = referenceData[offset + j];
+                value = spin - value;
+                referenceData[offset + j] = value;
+            }
 
-          // Unmap
-          error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-          test_error( error, "Unable to unmap buffer" );
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
         }
 
-        // Final validation: read actual values of buffer and compare against our reference
-        error = clEnqueueReadBuffer( queue, memObject, CL_TRUE, 0, bufferSize, finalData, 0, NULL, NULL );
-        test_error( error, "Unable to read results" );
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        error = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                    finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
 
-        for( size_t q = 0; q < bufferSize; q++ )
+        for (size_t q = 0; q < bufferSize; q++)
         {
             if (referenceData[q] != finalData[q])
             {
                 log_error(
-                "ERROR: Sample %d did not validate! Got %d, expected %d\n",
-                (int)q, (int)finalData[q], (int)referenceData[q]);
+                    "ERROR: Sample %d did not validate! Got %d, expected %d\n",
+                    (int)q, (int)finalData[q], (int)referenceData[q]);
                 return -1;
             }
         }
@@ -128,112 +141,129 @@
     return 0;
 }
 
-int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_image(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
 {
     int error;
     cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT32 };
     const size_t imageSize = 256;
     const size_t imageDataSize = imageSize * imageSize * 4 * sizeof(cl_uint);
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     BufferOwningPtr<cl_uint> hostPtrData{ malloc(imageDataSize) };
     BufferOwningPtr<cl_uint> referenceData{ malloc(imageDataSize) };
-    BufferOwningPtr<cl_uint> finalData{malloc(imageDataSize)};
+    BufferOwningPtr<cl_uint> finalData{ malloc(imageDataSize) };
 
-    MTdataHolder d{gRandomSeed};
-  for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++) {
-    clMemWrapper memObject;
-    log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
-
-    generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4), d,
-                         hostPtrData);
-    memcpy(referenceData, hostPtrData, imageDataSize);
-
-    cl_mem_flags flags = flag_set[src_flag_id];
-    bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
-    void *hostPtr = nullptr;
-    if (hasHostPtr) hostPtr = hostPtrData;
-    memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
-                                imageSize, imageSize, 0, hostPtr, &error );
-    test_error( error, "Unable to create testing buffer" );
-
-    if (!hasHostPtr) {
-      size_t write_origin[3]={0,0,0}, write_region[3]={imageSize, imageSize, 1};
-      error =
-      clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin, write_region,
-                          0, 0, hostPtrData, 0, NULL, NULL);
-      test_error( error, "Unable to write to testing buffer" );
-    }
-
-    for( int i = 0; i < 128; i++ )
+    MTdataHolder d{ gRandomSeed };
+    for (size_t src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set);
+         src_flag_id++)
     {
+        clMemWrapper memObject;
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
 
-      size_t offset[3], region[3];
-      size_t rowPitch;
+        generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4),
+                             d, hostPtrData);
+        memcpy(referenceData, hostPtrData, imageDataSize);
 
-      offset[ 0 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 0 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 0 ] - 1), d );
-      offset[ 1 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 1 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 1 ] - 1), d );
-      offset[ 2 ] = 0;
-      region[ 2 ] = 1;
-      cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                           offset, region, &rowPitch, NULL, 0, NULL, NULL, &error );
-      if( error != CL_SUCCESS )
-      {
-        print_error( error, "clEnqueueMapImage call failed" );
-        log_error( "\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0], (int)offset[1], (int)region[0], (int)region[1] );
-        return -1;
-      }
+        cl_mem_flags flags = flag_set[src_flag_id];
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        void *hostPtr = nullptr;
+        if (hasHostPtr) hostPtr = hostPtrData;
+        memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
+                                    imageSize, imageSize, 0, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
-      // Write into the region
-      cl_uint *mappedPtr = mappedRegion;
-      for( size_t y = 0; y < region[ 1 ]; y++ )
-      {
-        for( size_t x = 0; x < region[ 0 ] * 4; x++ )
+        if (!hasHostPtr)
         {
-          cl_int spin = (cl_int)random_in_range( 16, 1024, d );
-
-          cl_int value;
-          // Test read AND write in one swipe
-          value = mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ];
-          value = spin - value;
-          mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ] = value;
-
-          // Also update the initial data array
-          value =
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x];
-          value = spin - value;
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x] =
-          value;
+            size_t write_origin[3] = { 0, 0, 0 },
+                   write_region[3] = { imageSize, imageSize, 1 };
+            error = clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin,
+                                        write_region, 0, 0, hostPtrData, 0,
+                                        NULL, NULL);
+            test_error(error, "Unable to write to testing buffer");
         }
-      }
 
-      // Unmap
-      error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-      test_error( error, "Unable to unmap buffer" );
-    }
-
-    // Final validation: read actual values of buffer and compare against our reference
-    size_t finalOrigin[3] = { 0, 0, 0 }, finalRegion[3] = { imageSize, imageSize, 1 };
-    error = clEnqueueReadImage( queue, memObject, CL_TRUE, finalOrigin, finalRegion, 0, 0, finalData, 0, NULL, NULL );
-    test_error( error, "Unable to read results" );
-
-    for( size_t q = 0; q < imageSize * imageSize * 4; q++ )
-    {
-        if (referenceData[q] != finalData[q])
+        for (int i = 0; i < 128; i++)
         {
-            log_error("ERROR: Sample %d (coord %d,%d) did not validate! Got "
-                      "%d, expected %d\n",
-                      (int)q, (int)((q / 4) % imageSize),
-                      (int)((q / 4) / imageSize), (int)finalData[q],
-                      (int)referenceData[q]);
-            return -1;
+
+            size_t offset[3], region[3];
+            size_t rowPitch;
+
+            offset[0] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[0] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[0] - 1), d);
+            offset[1] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[1] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[1] - 1), d);
+            offset[2] = 0;
+            region[2] = 1;
+            cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                region, &rowPitch, NULL, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapImage call failed");
+                log_error("\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0],
+                          (int)offset[1], (int)region[0], (int)region[1]);
+                return -1;
+            }
+
+            // Write into the region
+            cl_uint *mappedPtr = mappedRegion;
+            for (size_t y = 0; y < region[1]; y++)
+            {
+                for (size_t x = 0; x < region[0] * 4; x++)
+                {
+                    cl_int spin = (cl_int)random_in_range(16, 1024, d);
+
+                    cl_int value;
+                    // Test read AND write in one swipe
+                    value = mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x];
+                    value = spin - value;
+                    mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x] = value;
+
+                    // Also update the initial data array
+                    value =
+                        referenceData[((offset[1] + y) * imageSize + offset[0])
+                                          * 4
+                                      + x];
+                    value = spin - value;
+                    referenceData[((offset[1] + y) * imageSize + offset[0]) * 4
+                                  + x] = value;
+                }
+            }
+
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
         }
-    }
-  } // cl_mem_flags
+
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        size_t finalOrigin[3] = { 0, 0, 0 },
+               finalRegion[3] = { imageSize, imageSize, 1 };
+        error = clEnqueueReadImage(queue, memObject, CL_TRUE, finalOrigin,
+                                   finalRegion, 0, 0, finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+
+        for (size_t q = 0; q < imageSize * imageSize * 4; q++)
+        {
+            if (referenceData[q] != finalData[q])
+            {
+                log_error(
+                    "ERROR: Sample %d (coord %d,%d) did not validate! Got "
+                    "%d, expected %d\n",
+                    (int)q, (int)((q / 4) % imageSize),
+                    (int)((q / 4) / imageSize), (int)finalData[q],
+                    (int)referenceData[q]);
+                return -1;
+            }
+        }
+    } // cl_mem_flags
 
     return 0;
 }
-
diff --git a/test_conformance/basic/test_fpmath.cpp b/test_conformance/basic/test_fpmath.cpp
new file mode 100644
index 0000000..9bdb192
--- /dev/null
+++ b/test_conformance/basic/test_fpmath.cpp
@@ -0,0 +1,386 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "harness/compat.h"
+#include "harness/rounding_mode.h"
+#include "harness/stringHelpers.h"
+
+#include <CL/cl_half.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "procs.h"
+
+static const char *fp_kernel_code = R"(
+%s
+__kernel void test_fp(__global TYPE *srcA, __global TYPE *srcB, __global TYPE *dst)
+{
+    int  tid = get_global_id(0);
+
+    dst[tid] = srcA[tid] OP srcB[tid];
+})"; // %s: extension pragmas; TYPE and OP are supplied as -D build options
+
+extern cl_half_rounding_mode halfRoundingMode; // defined outside this file
+
+#define HFF(num) cl_half_from_float(num, halfRoundingMode) // float -> half
+#define HTF(num) cl_half_to_float(num) // half -> float
+
+template <typename T> double toDouble(T val)
+{
+    // cl_half goes through HTF; every other type converts to double as is.
+    const bool is_half = std::is_same<cl_half, T>::value;
+    if (!is_half) return val;
+    return HTF(val);
+}
+
+bool isHalfNan(cl_half v)
+{
+    // Split the FP16 bit pattern into its mantissa and exponent fields.
+    const uint16_t mantissa = v & 0x3FF;
+    const uint16_t exponent = (v >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+    // NaN: exponent field all ones together with a non-zero mantissa.
+    if (exponent != 0x1F) return false;
+    return mantissa != 0;
+}
+
+cl_half half_plus(cl_half a, cl_half b)
+{
+    return HFF(HTF(a) + HTF(b)); // add as float, convert back to half
+}
+
+cl_half half_minus(cl_half a, cl_half b)
+{
+    return HFF(HTF(a) - HTF(b)); // subtract as float, convert back to half
+}
+
+cl_half half_mult(cl_half a, cl_half b)
+{
+    return HFF(HTF(a) * HTF(b)); // multiply as float, convert back to half
+}
+
+template <typename T> struct TestDef
+{
+    const char op; // operator character; passed to the kernel as -DOP
+    std::function<T(T, T)> ref; // host-side reference implementation
+    std::string type_str; // scalar type name ("half", "float", "double")
+    size_t vec_size; // vector width under test; set in test_fpmath
+};
+
+// Verifies output against test.ref; returns 0 on match, -1 on mismatch.
+template <typename T>
+int verify_fp(std::vector<T> (&input)[2], std::vector<T> &output,
+              const TestDef<T> &test)
+{
+    auto &inA = input[0];
+    auto &inB = input[1];
+    for (size_t i = 0; i < output.size(); i++)
+    {
+        T r = test.ref(inA[i], inB[i]);
+
+        // cl_half holds raw bits, so two NaNs can differ bitwise yet still
+        // agree. Every other mismatch, for any type, is a failure.
+        bool both_nan = false;
+        if (std::is_same<T, cl_half>::value)
+            both_nan = isHalfNan(r) && isHalfNan(output[i]);
+        if (r != output[i] && !both_nan)
+        {
+            log_error("FP math test for type: %s, vec size: %zu, failed at "
+                      "index %zu, %a '%c' %a, expected %a, get %a\n",
+                      test.type_str.c_str(), test.vec_size, i, toDouble(inA[i]),
+                      test.op, toDouble(inB[i]), toDouble(r),
+                      toDouble(output[i]));
+            return -1;
+        }
+    }
+    return 0;
+}
+
+template <typename T> void generate_random_inputs(std::vector<T> (&input)[2])
+{
+    // One generator seeded with gRandomSeed fills both input vectors.
+    RandomSeed seed(gRandomSeed);
+    if (std::is_same<T, double>::value)
+    {
+        // double: sampled between -2^63 and 2^63
+        auto gen = [&seed]() {
+            return get_random_double(-MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63),
+                                     MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63),
+                                     seed);
+        };
+        for (auto &vec : input) std::generate(vec.begin(), vec.end(), gen);
+    }
+    else if (std::is_same<T, float>::value)
+    {
+        // float: sampled between -2^31 and 2^31
+        auto gen = [&seed]() {
+            return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                    MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), seed);
+        };
+        for (auto &vec : input) std::generate(vec.begin(), vec.end(), gen);
+    }
+    else
+    {
+        // remaining case (half): floats between -2^8 and 2^8, through HFF
+        auto gen = [&seed]() {
+            return HFF(get_random_float(-MAKE_HEX_FLOAT(0x1.0p8f, 0x1, 8),
+                                        MAKE_HEX_FLOAT(0x1.0p8f, 0x1, 8),
+                                        seed));
+        };
+        for (auto &vec : input) std::generate(vec.begin(), vec.end(), gen);
+    }
+}
+
+struct TypesIterator // Runs the FP math tests for each type in TypeIter.
+{
+    using TypeIter = std::tuple<cl_float, cl_half, cl_double>;
+    // Records device capabilities, then runs every test before returning.
+    TypesIterator(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elems)
+        : context(context), queue(queue), fpConfigHalf(0), fpConfigFloat(0),
+          num_elements(num_elems)
+    {
+        // Type names are keyed by element size (typeid().name one day).
+        type2name[sizeof(cl_half)] = "half";
+        type2name[sizeof(cl_float)] = "float";
+        type2name[sizeof(cl_double)] = "double";
+
+        fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+        fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
+
+        fpConfigFloat = get_default_rounding_mode(deviceID);
+
+        if (fp16Support)
+            fpConfigHalf =
+                get_default_rounding_mode(deviceID, CL_DEVICE_HALF_FP_CONFIG);
+        // Runs the tests; iterate_type throws when one of them fails.
+        for_each_elem(it);
+    }
+    // Builds and runs the kernel for one operator at each vector size.
+    template <typename T> int test_fpmath(TestDef<T> &test)
+    {
+        constexpr size_t vecSizes[] = { 1, 2, 4, 8, 16 };
+        cl_int err = CL_SUCCESS;
+
+        std::ostringstream sstr;
+        if (std::is_same<T, double>::value)
+            sstr << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+
+        if (std::is_same<T, cl_half>::value)
+            sstr << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        // The pragmas (if any) replace the %s at the top of the kernel.
+        std::string program_source =
+            str_sprintf(std::string(fp_kernel_code), sstr.str().c_str());
+
+        for (unsigned i = 0; i < ARRAY_SIZE(vecSizes); i++)
+        {
+            test.vec_size = vecSizes[i];
+
+            std::ostringstream vecNameStr;
+            vecNameStr << test.type_str;
+            if (test.vec_size != 1) vecNameStr << test.vec_size;
+
+            clMemWrapper streams[3]; // [0], [1]: inputs; [2]: output
+            clProgramWrapper program;
+            clKernelWrapper kernel;
+
+            size_t length = sizeof(T) * num_elements * test.vec_size;
+
+            bool isRTZ = false;
+            RoundingMode oldMode = kDefaultRoundingMode;
+
+
+            // If we only support rtz mode
+            if (std::is_same<T, cl_half>::value)
+            {
+                if (CL_FP_ROUND_TO_ZERO == fpConfigHalf)
+                {
+                    isRTZ = true;
+                    oldMode = get_round();
+                }
+            }
+            else if (std::is_same<T, float>::value)
+            {
+                if (CL_FP_ROUND_TO_ZERO == fpConfigFloat)
+                {
+                    isRTZ = true;
+                    oldMode = get_round();
+                }
+            }
+
+            std::vector<T> inputs[]{
+                std::vector<T>(test.vec_size * num_elements),
+                std::vector<T>(test.vec_size * num_elements)
+            };
+            std::vector<T> output =
+                std::vector<T>(test.vec_size * num_elements);
+
+            generate_random_inputs<T>(inputs);
+
+            for (size_t i = 0; i < ARRAY_SIZE(streams); i++)
+            {
+                streams[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, length,
+                                            NULL, &err);
+                test_error(err, "clCreateBuffer failed.");
+            }
+            for (size_t i = 0; i < ARRAY_SIZE(inputs); i++)
+            {
+                err =
+                    clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0, length,
+                                         inputs[i].data(), 0, NULL, NULL);
+                test_error(err, "clEnqueueWriteBuffer failed.");
+            }
+
+            std::string build_options = "-DTYPE=";
+            build_options.append(vecNameStr.str())
+                .append(" -DOP=")
+                .append(1, test.op);
+
+            const char *ptr = program_source.c_str();
+            err =
+                create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                            "test_fp", build_options.c_str());
+
+            test_error(err, "create_single_kernel_helper failed");
+
+            for (size_t i = 0; i < ARRAY_SIZE(streams); i++)
+            {
+                err =
+                    clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+                test_error(err, "clSetKernelArgs failed.");
+            }
+
+            size_t threads[] = { static_cast<size_t>(num_elements) };
+            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL,
+                                         0, NULL, NULL);
+            test_error(err, "clEnqueueNDRangeKernel failed.");
+
+            err = clEnqueueReadBuffer(queue, streams[2], CL_TRUE, 0, length,
+                                      output.data(), 0, NULL, NULL);
+            test_error(err, "clEnqueueReadBuffer failed.");
+            // For RTZ devices, compute the host reference under RTZ too.
+            if (isRTZ) set_round(kRoundTowardZero, kfloat);
+
+            err = verify_fp(inputs, output, test);
+
+            if (isRTZ) set_round(oldMode, kfloat);
+
+            test_error(err, "test verification failed");
+            log_info("FP '%c' '%s' test passed\n", test.op,
+                     vecNameStr.str().c_str());
+        }
+
+        return err;
+    }
+    // Runs '+', '-' and '*' for T; a non-zero result signals a failure.
+    template <typename T> int test_fpmath_common()
+    {
+        int err = TEST_PASS;
+        if (std::is_same<cl_half, T>::value)
+        {
+            TestDef<T> tests[] = { { '+', half_plus, type2name[sizeof(T)] },
+                                   { '-', half_minus, type2name[sizeof(T)] },
+                                   { '*', half_mult, type2name[sizeof(T)] } };
+            for (auto &test : tests) err |= test_fpmath<T>(test);
+        }
+        else
+        {
+            TestDef<T> tests[] = {
+                { '+', std::plus<T>(), type2name[sizeof(T)] },
+                { '-', std::minus<T>(), type2name[sizeof(T)] },
+                { '*', std::multiplies<T>(), type2name[sizeof(T)] }
+            };
+            for (auto &test : tests) err |= test_fpmath<T>(test);
+        }
+
+        return err;
+    }
+    // True when T needs an extension that the device does not report.
+    template <typename T> bool skip_type()
+    {
+        if (std::is_same<double, T>::value && !fp64Support)
+            return true;
+        else if (std::is_same<cl_half, T>::value && !fp16Support)
+            return true;
+        return false;
+    }
+    // Tests Type unless skipped; throws std::runtime_error on failure.
+    template <std::size_t Cnt = 0, typename Type>
+    void iterate_type(const Type &t)
+    {
+        bool doTest = !skip_type<Type>();
+
+        if (doTest)
+        {
+            if (test_fpmath_common<Type>())
+            {
+                throw std::runtime_error("test_fpmath_common failed\n");
+            }
+        }
+    }
+    // End of recursion: Cnt equals the number of tuple elements.
+    template <std::size_t Cnt = 0, typename... Tp>
+    inline typename std::enable_if<Cnt == sizeof...(Tp), void>::type
+    for_each_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+    // Handles tuple element Cnt, then recurses with Cnt + 1.
+    template <std::size_t Cnt = 0, typename... Tp>
+        inline typename std::enable_if < Cnt<sizeof...(Tp), void>::type
+        for_each_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_type<Cnt>(std::get<Cnt>(t));
+        for_each_elem<Cnt + 1, Tp...>(t);
+    }
+
+protected:
+    TypeIter it; // supplies the type list; iterate_type ignores the values
+
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_device_fp_config fpConfigHalf;
+    cl_device_fp_config fpConfigFloat;
+
+    bool fp16Support;
+    bool fp64Support;
+
+    int num_elements;
+    std::map<size_t, std::string> type2name; // sizeof(T) -> type name
+};
+
+int test_fpmath(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    // The TypesIterator constructor runs every test; it throws on failure.
+    try
+    {
+        TypesIterator runner(device, context, queue, num_elements);
+    } catch (const std::runtime_error &failure)
+    {
+        log_error("%s", failure.what());
+        return TEST_FAIL;
+    }
+    return TEST_PASS;
+}
diff --git a/test_conformance/basic/test_fpmath_float.cpp b/test_conformance/basic/test_fpmath_float.cpp
deleted file mode 100644
index fced0f4..0000000
--- a/test_conformance/basic/test_fpmath_float.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "harness/rounding_mode.h"
-
-#include <algorithm>
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "procs.h"
-
-struct TestDef
-{
-    const char op;
-    std::function<float(float, float)> ref;
-};
-
-static const char *fp_kernel_code = R"(
-__kernel void test_fp(__global TYPE *srcA, __global TYPE *srcB, __global TYPE *dst)
-{
-    int  tid = get_global_id(0);
-
-    dst[tid] = srcA[tid] OP srcB[tid];
-})";
-
-static int verify_fp(std::vector<float> (&input)[2], std::vector<float> &output,
-                     const TestDef &test)
-{
-
-    auto &inA = input[0];
-    auto &inB = input[1];
-    for (int i = 0; i < output.size(); i++)
-    {
-        float r = test.ref(inA[i], inB[i]);
-        if (r != output[i])
-        {
-            log_error("FP '%c' float test failed\n", test.op);
-            return -1;
-        }
-    }
-
-    log_info("FP '%c' float test passed\n", test.op);
-    return 0;
-}
-
-
-void generate_random_inputs(std::vector<cl_float> (&input)[2])
-{
-    RandomSeed seed(gRandomSeed);
-
-    auto random_generator = [&seed]() {
-        return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
-                                MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), seed);
-    };
-
-    for (auto &v : input)
-    {
-        std::generate(v.begin(), v.end(), random_generator);
-    }
-}
-
-template <size_t N>
-int test_fpmath(cl_device_id device, cl_context context, cl_command_queue queue,
-                int num_elements, const std::string type_str,
-                const TestDef &test)
-{
-    clMemWrapper streams[3];
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-
-    int err;
-
-    size_t length = sizeof(cl_float) * num_elements * N;
-
-    int isRTZ = 0;
-    RoundingMode oldMode = kDefaultRoundingMode;
-
-    // If we only support rtz mode
-    if (CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device))
-    {
-        isRTZ = 1;
-        oldMode = get_round();
-    }
-
-
-    std::vector<cl_float> inputs[]{ std::vector<cl_float>(N * num_elements),
-                                    std::vector<cl_float>(N * num_elements) };
-    std::vector<cl_float> output = std::vector<cl_float>(N * num_elements);
-
-    generate_random_inputs(inputs);
-
-    for (int i = 0; i < ARRAY_SIZE(streams); i++)
-    {
-        streams[i] =
-            clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, &err);
-        test_error(err, "clCreateBuffer failed.");
-    }
-    for (int i = 0; i < ARRAY_SIZE(inputs); i++)
-    {
-        err = clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0, length,
-                                   inputs[i].data(), 0, NULL, NULL);
-        test_error(err, "clEnqueueWriteBuffer failed.");
-    }
-
-    std::string build_options = "-DTYPE=";
-    build_options.append(type_str).append(" -DOP=").append(1, test.op);
-
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &fp_kernel_code, "test_fp",
-                                      build_options.c_str());
-
-    test_error(err, "create_single_kernel_helper failed");
-
-    for (int i = 0; i < ARRAY_SIZE(streams); i++)
-    {
-        err = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
-        test_error(err, "clSetKernelArgs failed.");
-    }
-
-    size_t threads[] = { static_cast<size_t>(num_elements) };
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL, 0, NULL,
-                                 NULL);
-    test_error(err, "clEnqueueNDRangeKernel failed.");
-
-    err = clEnqueueReadBuffer(queue, streams[2], CL_TRUE, 0, length,
-                              output.data(), 0, NULL, NULL);
-    test_error(err, "clEnqueueReadBuffer failed.");
-
-    if (isRTZ) set_round(kRoundTowardZero, kfloat);
-
-    err = verify_fp(inputs, output, test);
-
-    if (isRTZ) set_round(oldMode, kfloat);
-
-    return err;
-}
-
-
-template <size_t N>
-int test_fpmath_common(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements,
-                       const std::string type_str)
-{
-    TestDef tests[] = { { '+', std::plus<float>() },
-                        { '-', std::minus<float>() },
-                        { '*', std::multiplies<float>() } };
-    int err = TEST_PASS;
-
-    for (const auto &test : tests)
-    {
-        err |= test_fpmath<N>(device, context, queue, num_elements, type_str,
-                              test);
-    }
-
-    return err;
-}
-
-int test_fpmath_float(cl_device_id device, cl_context context,
-                      cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<1>(device, context, queue, num_elements, "float");
-}
-
-int test_fpmath_float2(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<2>(device, context, queue, num_elements,
-                                 "float2");
-}
-
-int test_fpmath_float4(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<4>(device, context, queue, num_elements,
-                                 "float4");
-}
diff --git a/test_conformance/basic/test_get_linear_ids.cpp b/test_conformance/basic/test_get_linear_ids.cpp
index 3496fd0..ee7dfb2 100644
--- a/test_conformance/basic/test_get_linear_ids.cpp
+++ b/test_conformance/basic/test_get_linear_ids.cpp
@@ -104,15 +104,19 @@
 
         switch (dims) {
         case 1:
-            log_info("  testing offset=%u global=%u local=%u...\n", gwo[0], gws[0], lws[0]);
+            log_info("  testing offset=%zu global=%zu local=%zu...\n", gwo[0],
+                     gws[0], lws[0]);
             break;
         case 2:
-            log_info("  testing offset=(%u,%u) global=(%u,%u) local=(%u,%u)...\n",
-                    gwo[0], gwo[1], gws[0], gws[1], lws[0], lws[1]);
+            log_info("  testing offset=(%zu,%zu) global=(%zu,%zu) "
+                     "local=(%zu,%zu)...\n",
+                     gwo[0], gwo[1], gws[0], gws[1], lws[0], lws[1]);
             break;
         case 3:
-            log_info("  testing offset=(%u,%u,%u) global=(%u,%u,%u) local=(%u,%u,%u)...\n",
-                    gwo[0], gwo[1], gwo[2], gws[0], gws[1], gws[2], lws[0], lws[1], lws[2]);
+            log_info("  testing offset=(%zu,%zu,%zu) global=(%zu,%zu,%zu) "
+                     "local=(%zu,%zu,%zu)...\n",
+                     gwo[0], gwo[1], gwo[2], gws[0], gws[1], gws[2], lws[0],
+                     lws[1], lws[2]);
             break;
         }
 
diff --git a/test_conformance/basic/test_hiloeo.cpp b/test_conformance/basic/test_hiloeo.cpp
index 3470ad0..4e921a6 100644
--- a/test_conformance/basic/test_hiloeo.cpp
+++ b/test_conformance/basic/test_hiloeo.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -13,14 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
-
+#include <iomanip>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
-#include <limits.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
+#include <vector>
 
 #include "procs.h"
 
@@ -31,9 +30,10 @@
 
 typedef int (*OffsetFunc)( int index, int vectorSize );
 static const OffsetFunc offsetFuncs[4] = { hi_offset, lo_offset, even_offset, odd_offset };
-typedef int (*verifyFunc)( const void *, const void *, const void *, int n, const char *sizeName );
 static const char *operatorToUse_names[] = { "hi", "lo", "even", "odd" };
-static const char *test_str_names[] = { "char", "uchar", "short", "ushort", "int", "uint", "long", "ulong", "float", "double" };
+static const char *test_str_names[] = { "char", "uchar", "short", "ushort",
+                                        "int",  "uint",  "long",  "ulong",
+                                        "half", "float", "double" };
 
 static const unsigned int vector_sizes[] =     { 1, 2, 3, 4, 8, 16};
 static const unsigned int vector_aligns[] =    { 1, 2, 4, 4, 8, 16};
@@ -45,43 +45,41 @@
 // strcat(gentype, vector_size_names[out_vector_idx[i]]);
 static const char *vector_size_names[] = { "", "2", "3", "4", "8", "16"};
 
-static const size_t  kSizes[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 };
+static const size_t kSizes[] = { 1, 1, 2, 2, 4, 4, 8, 8, 2, 4, 8 };
 static int CheckResults( void *in, void *out, size_t elementCount, int type, int vectorSize, int operatorToUse );
 
 int test_hiloeo(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
 {
-    cl_int *input_ptr, *output_ptr, *p;
     int err;
-    cl_uint i;
     int hasDouble = is_extension_available( device, "cl_khr_fp64" );
+    int hasHalf = is_extension_available(device, "cl_khr_fp16");
     cl_uint vectorSize, operatorToUse;
     cl_uint type;
-    MTdata d;
+    MTdataHolder d(gRandomSeed);
 
     int expressionMode;
     int numExpressionModes = 2;
 
     size_t length = sizeof(cl_int) * 4 * n_elems;
 
-    input_ptr   = (cl_int*)malloc(length);
-    output_ptr  = (cl_int*)malloc(length);
+    std::vector<cl_int> input_ptr(4 * n_elems);
+    std::vector<cl_int> output_ptr(4 * n_elems);
 
-    p = input_ptr;
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<4 * (cl_uint) n_elems; i++)
-        p[i] = genrand_int32(d);
-    free_mtdata(d); d = NULL;
+    for (cl_uint i = 0; i < 4 * (cl_uint)n_elems; i++)
+        input_ptr[i] = genrand_int32(d);
 
     for( type = 0; type < sizeof( test_str_names ) / sizeof( test_str_names[0] ); type++ )
     {
         // Note: restrict the element count here so we don't end up overrunning the output buffer if we're compensating for 32-bit writes
         size_t elementCount = length / kSizes[type];
-        cl_mem streams[2];
+        clMemWrapper streams[2];
 
         // skip double if unavailable
         if( !hasDouble && ( 0 == strcmp( test_str_names[type], "double" )))
             continue;
 
+        if (!hasHalf && (0 == strcmp(test_str_names[type], "half"))) continue;
+
         if( !gHasLong &&
             (( 0 == strcmp( test_str_names[type], "long" )) ||
             ( 0 == strcmp( test_str_names[type], "ulong" ))))
@@ -104,12 +102,9 @@
             return -1;
         }
 
-        err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length, input_ptr, 0, NULL, NULL);
-        if (err != CL_SUCCESS)
-        {
-            log_error("clEnqueueWriteBuffer failed\n");
-            return -1;
-        }
+        err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length,
+                                   input_ptr.data(), 0, NULL, NULL);
+        test_error(err, "clEnqueueWriteBuffer failed\n");
 
         for( operatorToUse = 0; operatorToUse < sizeof( operatorToUse_names ) / sizeof( operatorToUse_names[0] ); operatorToUse++ )
         {
@@ -118,8 +113,8 @@
             for( vectorSize = 1; vectorSize < sizeof( vector_size_names ) / sizeof( vector_size_names[0] ); vectorSize++ ) {
                 for(expressionMode = 0; expressionMode < numExpressionModes; ++expressionMode) {
 
-                    cl_program program = NULL;
-                    cl_kernel kernel = NULL;
+                    clProgramWrapper program;
+                    clKernelWrapper kernel;
                     cl_uint outVectorSize = out_vector_idx[vectorSize];
                     char expression[1024];
 
@@ -139,92 +134,64 @@
                         "}\n"
                     };
 
-                    if(expressionMode == 0) {
-                        sprintf(expression, "srcA[tid]");
-                    } else if(expressionMode == 1) {
-                        switch(vector_sizes[vectorSize]) {
-                            case 16:
-                                sprintf(expression,
-                                        "((%s16)(srcA[tid].s0, srcA[tid].s1, srcA[tid].s2, srcA[tid].s3, srcA[tid].s4, srcA[tid].s5, srcA[tid].s6, srcA[tid].s7, srcA[tid].s8, srcA[tid].s9, srcA[tid].sA, srcA[tid].sB, srcA[tid].sC, srcA[tid].sD, srcA[tid].sE, srcA[tid].sf))",
-                                        test_str_names[type]
-                                        );
-                                break;
-                            case 8:
-                                sprintf(expression,
-                                        "((%s8)(srcA[tid].s0, srcA[tid].s1, srcA[tid].s2, srcA[tid].s3, srcA[tid].s4, srcA[tid].s5, srcA[tid].s6, srcA[tid].s7))",
-                                        test_str_names[type]
-                                        );
-                                break;
-                            case 4:
-                                sprintf(expression,
-                                        "((%s4)(srcA[tid].s0, srcA[tid].s1, srcA[tid].s2, srcA[tid].s3))",
-                                        test_str_names[type]
-                                        );
-                                break;
-                            case 3:
-                                sprintf(expression,
-                                        "((%s3)(srcA[tid].s0, srcA[tid].s1, srcA[tid].s2))",
-                                        test_str_names[type]
-                                        );
-                                break;
-                            case 2:
-                                sprintf(expression,
-                                        "((%s2)(srcA[tid].s0, srcA[tid].s1))",
-                                        test_str_names[type]
-                                        );
-                                break;
-                            default :
-                                sprintf(expression, "srcA[tid]");
-                                log_info("Default\n");
-                        }
-                    } else {
-                        sprintf(expression, "srcA[tid]");
+                    if (expressionMode == 1 && vector_sizes[vectorSize] != 1)
+                    {
+                        std::ostringstream sstr;
+                        const char *index_chars[] = { "0", "1", "2", "3",
+                                                      "4", "5", "6", "7",
+                                                      "8", "9", "A", "B",
+                                                      "C", "D", "E", "f" };
+                        sstr << "((" << test_str_names[type]
+                             << std::to_string(vector_sizes[vectorSize])
+                             << ")(";
+                        for (unsigned i = 0; i < vector_sizes[vectorSize]; i++)
+                            sstr << " srcA[tid].s" << index_chars[i] << ",";
+                        sstr.seekp(-1, sstr.cur);
+                        sstr << "))";
+                        std::snprintf(expression, sizeof(expression), "%s",
+                                      sstr.str().c_str());
+                    }
+                    else
+                    {
+                        std::snprintf(expression, sizeof(expression),
+                                      "srcA[tid]");
                     }
 
                     if (0 == strcmp( test_str_names[type], "double" ))
                         source[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
+                    if (0 == strcmp(test_str_names[type], "half"))
+                        source[0] =
+                            "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
                     char kernelName[128];
                     snprintf( kernelName, sizeof( kernelName ), "test_%s_%s%s", operatorToUse_names[ operatorToUse ], test_str_names[type], vector_size_names[vectorSize] );
                     err = create_single_kernel_helper(context, &program, &kernel, sizeof( source ) / sizeof( source[0] ), source, kernelName );
-                    if (err)
-                        return -1;
+                    test_error(err, "create_single_kernel_helper failed\n");
 
                     err  = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
                     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-                    if (err != CL_SUCCESS)
-                    {
-                        log_error("clSetKernelArgs failed\n");
-                        return -1;
-                    }
+                    test_error(err, "clSetKernelArg failed\n");
 
                     //Wipe the output buffer clean
                     uint32_t pattern = 0xdeadbeef;
-                    memset_pattern4( output_ptr, &pattern, length );
-                    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
-                    if (err != CL_SUCCESS)
-                    {
-                        log_error("clEnqueueWriteBuffer failed\n");
-                        return -1;
-                    }
+                    memset_pattern4(output_ptr.data(), &pattern, length);
+                    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0,
+                                               length, output_ptr.data(), 0,
+                                               NULL, NULL);
+                    test_error(err, "clEnqueueWriteBuffer failed\n");
 
                     size_t size = elementCount / (vector_aligns[vectorSize]);
                     err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &size, NULL, 0, NULL, NULL);
-                    if (err != CL_SUCCESS)
-                    {
-                        log_error("clEnqueueNDRangeKernel failed\n");
-                        return -1;
-                    }
+                    test_error(err, "clEnqueueNDRangeKernel failed\n");
 
-                    err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
-                    if (err != CL_SUCCESS)
-                    {
-                        log_error("clEnqueueReadBuffer failed\n");
-                        return -1;
-                    }
+                    err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                              length, output_ptr.data(), 0,
+                                              NULL, NULL);
+                    test_error(err, "clEnqueueReadBuffer failed\n");
 
-                    char *inP = (char *)input_ptr;
-                    char *outP = (char *)output_ptr;
+                    char *inP = (char *)input_ptr.data();
+                    char *outP = (char *)output_ptr.data();
                     outP += kSizes[type] * ( ( vector_sizes[outVectorSize] ) -
                                             ( vector_sizes[ out_vector_idx[vectorSize] ] ) );
                     // was                outP += kSizes[type] * ( ( 1 << outVectorSize ) - ( 1 << ( vectorSize - 1 ) ) );
@@ -240,180 +207,88 @@
                         inP += kSizes[type] * ( vector_aligns[vectorSize] );
                         outP += kSizes[type] * ( vector_aligns[outVectorSize] );
                     }
-
-                    clReleaseKernel( kernel );
-                    clReleaseProgram( program );
                     log_info( "." );
                     fflush( stdout );
                 }
             }
         }
-
-        clReleaseMemObject( streams[0] );
-        clReleaseMemObject( streams[1] );
         log_info( "done\n" );
     }
 
     log_info("HiLoEO test passed\n");
-
-    free(input_ptr);
-    free(output_ptr);
-
     return err;
 }
 
-static int CheckResults( void *in, void *out, size_t elementCount, int type, int vectorSize, int operatorToUse )
+// Recomputes each expected output vector on the host using
+// offsetFuncs[operatorToUse] and compares the first cmpVectorSize elements
+// with the device result. Returns 0 on success, -1 on the first mismatch.
+template <typename T>
+cl_int verify(void *in, void *out, size_t elementCount, int type,
+              int vectorSize, int operatorToUse, size_t cmpVectorSize)
 {
-    cl_ulong  array[8];
+    size_t halfVectorSize = vector_sizes[out_vector_idx[vectorSize]];
+    size_t elementSize = kSizes[type];
+    OffsetFunc f = offsetFuncs[operatorToUse];
+    cl_ulong array[8];
     void *p = array;
-    size_t halfVectorSize  = vector_sizes[out_vector_idx[vectorSize]];
-    size_t cmpVectorSize =  vector_sizes[out_vector_idx[vectorSize]];
-    // was 1 << (vectorSize-1);
-    OffsetFunc f = offsetFuncs[ operatorToUse ];
-    size_t elementSize =  kSizes[type];
 
-    if(vector_size_names[vectorSize][0] == '3') {
-        if(operatorToUse_names[operatorToUse][0] == 'h' ||
-           operatorToUse_names[operatorToUse][0] == 'o') // hi or odd
+    T *i = (T *)in, *o = (T *)out;
+    for (cl_uint k = 0; k < elementCount; k++)
+    {
+        T *o2 = (T *)p;
+        for (size_t j = 0; j < halfVectorSize; j++)
+            o2[j] = i[f((int)j, (int)halfVectorSize * 2)];
+        if (memcmp(o, o2, elementSize * cmpVectorSize))
+        {
+            std::ostringstream ss;
+            ss << "\n" << k << ") Failure for " << test_str_names[type]
+               << vector_size_names[vectorSize] << '.'
+               << operatorToUse_names[operatorToUse] << " { "
+               << "0x" << std::setfill('0') << std::setw(elementSize * 2)
+               << std::hex << i[0];
+            for (size_t j = 1; j < halfVectorSize * 2; j++) ss << ", " << i[j];
+            ss << " } --> { " << o[0];
+            for (size_t j = 1; j < halfVectorSize; j++) ss << ", " << o[j];
+            ss << " }\n";
+            // Report the mismatching input/output vectors before bailing out.
+            log_info("%s", ss.str().c_str());
+            return -1;
+        }
+        i += 2 * halfVectorSize;
+        o += halfVectorSize;
+    }
+    return 0;
+}
+
+static int CheckResults(void *in, void *out, size_t elementCount, int type,
+                        int vectorSize, int operatorToUse)
+{
+    size_t cmpVectorSize = vector_sizes[out_vector_idx[vectorSize]];
+    size_t elementSize = kSizes[type];
+
+    if (vector_size_names[vectorSize][0] == '3')
+    {
+        if (operatorToUse_names[operatorToUse][0] == 'h'
+            || operatorToUse_names[operatorToUse][0] == 'o') // hi or odd
         {
             cmpVectorSize = 1; // special case for vec3 ignored values
         }
     }
 
-    switch( elementSize )
+    switch (elementSize)
     {
         case 1:
-        {
-            char *i = (char*)in;
-            char *o = (char*)out;
-            size_t j;
-            cl_uint k;
-            OffsetFunc f = offsetFuncs[ operatorToUse ];
-
-            for( k = 0; k  < elementCount; k++ )
-            {
-                char *o2 = (char*)p;
-                for( j = 0; j < halfVectorSize; j++ )
-                    o2[j] = i[ f((int)j, (int)halfVectorSize*2) ];
-
-                if( memcmp( o, o2, elementSize * cmpVectorSize ) )
-                {
-                    log_info( "\n%d) Failure for %s%s.%s { %d", k, test_str_names[type], vector_size_names[ vectorSize ], operatorToUse_names[ operatorToUse ], i[0] );
-                    for( j = 1; j < halfVectorSize * 2; j++ )
-                        log_info( ", %d", i[j] );
-                    log_info( " } --> { %d", o[0] );
-                    for( j = 1; j < halfVectorSize; j++ )
-                        log_info( ", %d", o[j] );
-                    log_info( " }\n" );
-                    return -1;
-                }
-                i += 2 * halfVectorSize;
-                o += halfVectorSize;
-            }
-        }
-            break;
-
+            return verify<char>(in, out, elementCount, type, vectorSize,
+                                operatorToUse, cmpVectorSize);
         case 2:
-        {
-            short *i = (short*)in;
-            short *o = (short*)out;
-            size_t j;
-            cl_uint k;
-
-            for( k = 0; k  < elementCount; k++ )
-            {
-                short *o2 = (short*)p;
-                for( j = 0; j < halfVectorSize; j++ )
-                    o2[j] = i[ f((int)j, (int)halfVectorSize*2) ];
-
-                if( memcmp( o, o2, elementSize * cmpVectorSize ) )
-                {
-                    log_info( "\n%d) Failure for %s%s.%s { %d", k, test_str_names[type], vector_size_names[ vectorSize ], operatorToUse_names[ operatorToUse ], i[0] );
-                    for( j = 1; j < halfVectorSize * 2; j++ )
-                        log_info( ", %d", i[j] );
-                    log_info( " } --> { %d", o[0] );
-                    for( j = 1; j < halfVectorSize; j++ )
-                        log_info( ", %d", o[j] );
-                    log_info( " }\n" );
-                    return -1;
-                }
-                i += 2 * halfVectorSize;
-                o += halfVectorSize;
-            }
-        }
-            break;
-
+            return verify<short>(in, out, elementCount, type, vectorSize,
+                                 operatorToUse, cmpVectorSize);
         case 4:
-        {
-            int *i = (int*)in;
-            int *o = (int*)out;
-            size_t j;
-            cl_uint k;
-
-            for( k = 0; k  < elementCount; k++ )
-            {
-                int *o2 = (int *)p;
-                for( j = 0; j < halfVectorSize; j++ )
-                    o2[j] = i[ f((int)j, (int)halfVectorSize*2) ];
-
-                for( j = 0; j < cmpVectorSize; j++ )
-        {
-            /* Allow float nans to be binary different */
-            if( memcmp( &o[j], &o2[j], elementSize ) && !((strcmp(test_str_names[type], "float") == 0) && isnan(((float *)o)[j]) && isnan(((float *)o2)[j])))
-            {
-                log_info( "\n%d) Failure for %s%s.%s { 0x%8.8x", k, test_str_names[type], vector_size_names[ vectorSize ], operatorToUse_names[ operatorToUse ], i[0] );
-            for( j = 1; j < halfVectorSize * 2; j++ )
-                log_info( ", 0x%8.8x", i[j] );
-            log_info( " } --> { 0x%8.8x", o[0] );
-            for( j = 1; j < halfVectorSize; j++ )
-                log_info( ", 0x%8.8x", o[j] );
-            log_info( " }\n" );
-            return -1;
-            }
-        }
-        i += 2 * halfVectorSize;
-        o += halfVectorSize;
-            }
-        }
-            break;
-
+            return verify<int>(in, out, elementCount, type, vectorSize,
+                               operatorToUse, cmpVectorSize);
         case 8:
-        {
-            cl_ulong *i = (cl_ulong*)in;
-            cl_ulong *o = (cl_ulong*)out;
-            size_t j;
-            cl_uint k;
-
-            for( k = 0; k  < elementCount; k++ )
-            {
-                cl_ulong *o2 = (cl_ulong*)p;
-                for( j = 0; j < halfVectorSize; j++ )
-                    o2[j] = i[ f((int)j, (int)halfVectorSize*2) ];
-
-                if( memcmp( o, o2, elementSize * cmpVectorSize ) )
-                {
-                    log_info( "\n%d) Failure for %s%s.%s { 0x%16.16llx", k, test_str_names[type], vector_size_names[ vectorSize ], operatorToUse_names[ operatorToUse ], i[0] );
-                    for( j = 1; j < halfVectorSize * 2; j++ )
-                        log_info( ", 0x%16.16llx", i[j] );
-                    log_info( " } --> { 0x%16.16llx", o[0] );
-                    for( j = 1; j < halfVectorSize; j++ )
-                        log_info( ", 0x%16.16llx", o[j] );
-                    log_info( " }\n" );
-                    return -1;
-                }
-                i += 2 * halfVectorSize;
-                o += halfVectorSize;
-            }
-        }
-            break;
-
-        default:
-            log_info( "Internal error. Unknown data type\n" );
-            return -2;
+            return verify<cl_ulong>(in, out, elementCount, type, vectorSize,
+                                    operatorToUse, cmpVectorSize);
+        default: log_info("Internal error. Unknown data type\n"); return -2;
     }
-
-    return 0;
 }
-
-
-
diff --git a/test_conformance/basic/test_image_r8.cpp b/test_conformance/basic/test_image_r8.cpp
index b633d6a..2dca161 100644
--- a/test_conformance/basic/test_image_r8.cpp
+++ b/test_conformance/basic/test_image_r8.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,163 +21,111 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <vector>
 
 #include "procs.h"
 
-static const char *r_uint8_kernel_code =
-"__kernel void test_r_uint8(read_only image2d_t srcimg, __global unsigned char *dst, sampler_t sampler)\n"
-"{\n"
-"    int    tid_x = get_global_id(0);\n"
-"    int    tid_y = get_global_id(1);\n"
-"    int    indx = tid_y * get_image_width(srcimg) + tid_x;\n"
-"    uint4    color;\n"
-"\n"
-"    color = read_imageui(srcimg, sampler, (int2)(tid_x, tid_y));\n"
-"    dst[indx] = (unsigned char)(color.x);\n"
-"\n"
-"}\n";
-
-
-static unsigned char *
-generate_8bit_image(int w, int h, MTdata d)
+namespace {
+const char *r_uint8_kernel_code = R"(
+__kernel void test_r_uint8(read_only image2d_t srcimg, __global unsigned char *dst, sampler_t sampler)
 {
-    unsigned char    *ptr = (unsigned char*)malloc(w * h * sizeof(unsigned char));
-    int             i;
+    int    tid_x = get_global_id(0);
+    int    tid_y = get_global_id(1);
+    int    indx = tid_y * get_image_width(srcimg) + tid_x;
+    uint4  color;
 
-    for (i=0; i<w*h; i++)
-      ptr[i] = (unsigned char)genrand_int32(d);
+    color = read_imageui(srcimg, sampler, (int2)(tid_x, tid_y));
+    dst[indx] = (unsigned char)(color.x);
+})";
 
-    return ptr;
+
+void generate_random_inputs(std::vector<cl_uchar> &v)
+{
+    RandomSeed seed(gRandomSeed);
+
+    auto random_generator = [&seed]() {
+        return static_cast<cl_uchar>(genrand_int32(seed));
+    };
+
+    std::generate(v.begin(), v.end(), random_generator);
 }
 
-static int
-verify_8bit_image(unsigned char *image, unsigned char *outptr, int w, int h)
-{
-    int     i;
-
-    for (i=0; i<w*h; i++)
-    {
-        if (outptr[i] != image[i])
-        {
-            log_error("READ_IMAGE_R_UNSIGNED_INT8 test failed\n");
-            return -1;
-        }
-    }
-
-    log_info("READ_IMAGE_R_UNSIGNED_INT8 test passed\n");
-    return 0;
 }
-
-int
-test_image_r8(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_image_r8(cl_device_id device, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    cl_mem            streams[2];
-    cl_image_format    img_format;
-    cl_uchar    *input_ptr, *output_ptr;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    threads[3];
-    int                img_width = 512;
-    int                img_height = 512;
-    int                err;
-    MTdata          d;
+    clMemWrapper streams[2];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    const size_t img_width = 512;
+    const size_t img_height = 512;
+    const size_t length = img_width * img_height;
+    int err;
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( device )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(device)
 
-    img_format.image_channel_order = CL_R;
-    img_format.image_channel_data_type = CL_UNSIGNED_INT8;
+    const cl_image_format img_format = { CL_R, CL_UNSIGNED_INT8 };
 
     // early out if this image type is not supported
     if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
                                    CL_MEM_OBJECT_IMAGE2D, &img_format))
     {
         log_info("WARNING: Image type not supported; skipping test.\n");
-        return 0;
+        return TEST_SKIPPED_ITSELF;
     }
 
-    d = init_genrand( gRandomSeed );
-    input_ptr = generate_8bit_image(img_width, img_height, d);
-    free_mtdata(d); d = NULL;
+    std::vector<cl_uchar> input(length);
+    std::vector<cl_uchar> output(length);
 
-    output_ptr = (cl_uchar*)malloc(sizeof(cl_uchar) * img_width * img_height);
+    generate_random_inputs(input);
+
     streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &img_format,
-                                 img_width, img_height, 0, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("create_image_2d failed\n");
-        return -1;
-    }
+                                 img_width, img_height, 0, nullptr, &err);
+    test_error(err, "create_image_2d failed.");
 
     streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uchar) * img_width * img_height, NULL, NULL);
-    if (!streams[1])
+        clCreateBuffer(context, CL_MEM_READ_WRITE, length, nullptr, &err);
+    test_error(err, "clCreateBuffer failed.");
+
+    const size_t origin[3] = { 0, 0, 0 },
+                 region[3] = { img_width, img_height, 1 };
+    err = clEnqueueWriteImage(queue, streams[0], CL_TRUE, origin, region, 0, 0,
+                              input.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteImage failed.");
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &r_uint8_kernel_code, "test_r_uint8");
+    test_error(err, "create_single_kernel_helper failed.");
+
+    clSamplerWrapper sampler = clCreateSampler(
+        context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
+    test_error(err, "clCreateSampler failed");
+
+    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+    err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler);
+    test_error(err, "clSetKernelArgs failed\n");
+
+    size_t threads[] = { img_width, img_height };
+    err = clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, threads, nullptr, 0,
+                                 nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed\n");
+
+
+    err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, length,
+                              output.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed\n");
+
+    if (0 != memcmp(input.data(), output.data(), length))
     {
-        log_error("clCreateBuffer failed\n");
-        return -1;
+        log_error("READ_IMAGE_R_UNSIGNED_INT8 test failed\n");
+        err = -1;
     }
-
-    size_t origin[3] = {0,0,0}, region[3]={img_width, img_height, 1};
-    err = clEnqueueWriteImage(queue, streams[0], CL_TRUE,
-                            origin, region, 0, 0,
-                            input_ptr,
-                            0, NULL, NULL);
-    if (err != CL_SUCCESS)
+    else
     {
-        log_error("clWriteImage failed: %d\n", err);
-        return -1;
+        log_info("READ_IMAGE_R_UNSIGNED_INT8 test passed\n");
     }
 
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &r_uint8_kernel_code, "test_r_uint8" );
-    if (err) {
-    log_error("Failed to create kernel and program: %d\n", err);
-    return -1;
-  }
-
-  cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
-  test_error(err, "clCreateSampler failed");
-
-  err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-  err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-  err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed: %d\n", err);
-        return -1;
-    }
-
-    threads[0] = (size_t)img_width;
-    threads[1] = (size_t)img_height;
-    err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-
-    err = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof(cl_uchar)*img_width*img_height, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-
-    err = verify_8bit_image(input_ptr, output_ptr, img_width, img_height);
-
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    clReleaseSampler(sampler);
-    free(input_ptr);
-    free(output_ptr);
-
     return err;
 }
-
-
-
-
-
diff --git a/test_conformance/basic/test_int2float.cpp b/test_conformance/basic/test_int2float.cpp
deleted file mode 100644
index 3a8458c..0000000
--- a/test_conformance/basic/test_int2float.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-
-#include "procs.h"
-
-const char *int2float_kernel_code =
-"__kernel void test_int2float(__global int *src, __global float *dst)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"\n"
-"    dst[tid] = (float)src[tid];\n"
-"\n"
-"}\n";
-
-
-int
-verify_int2float(cl_int *inptr, cl_float *outptr, int n)
-{
-    int     i;
-
-    for (i=0; i<n; i++)
-    {
-        if (outptr[i] != (float)inptr[i])
-        {
-            log_error("INT2FLOAT test failed\n");
-            return -1;
-        }
-    }
-
-    log_info("INT2FLOAT test passed\n");
-    return 0;
-}
-
-int
-test_int2float(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    cl_mem            streams[2];
-    cl_int            *input_ptr;
-    cl_float        *output_ptr;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    threads[1];
-    int                err;
-    int                i;
-    MTdata          d;
-
-    input_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
-    output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_float) * num_elements, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        input_ptr[i] = (cl_int)get_random_float(-MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), d);
-    free_mtdata(d); d = NULL;
-
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-
-    err = create_single_kernel_helper(context, &program, &kernel, 1, &int2float_kernel_code, "test_int2float");
-    if (err != CL_SUCCESS)
-    {
-        log_error("create_single_kernel_helper failed\n");
-        return -1;
-    }
-
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
-
-    threads[0] = (size_t)num_elements;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
-    }
-
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
-
-    err = verify_int2float(input_ptr, output_ptr, num_elements);
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(output_ptr);
-
-    return err;
-}
-
-
-
-
-
diff --git a/test_conformance/basic/test_int2fp.cpp b/test_conformance/basic/test_int2fp.cpp
new file mode 100644
index 0000000..dd5cc9a
--- /dev/null
+++ b/test_conformance/basic/test_int2fp.cpp
@@ -0,0 +1,325 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "CL/cl_half.h"
+#include "harness/compat.h"
+#include "harness/errorHelpers.h"
+#include "harness/stringHelpers.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <vector>
+
+#include "procs.h"
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+namespace {
+const char *int2float_kernel_code = R"(
+%s
+__kernel void test_X2Y(__global TYPE_X *src, __global TYPE_Y *dst)
+{
+    int  tid = get_global_id(0);
+
+    dst[tid] = (TYPE_Y)src[tid];
+
+})";
+
+template <bool int2fp> struct TypesIterator
+{
+    TypesIterator(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elems, const char *test_name)
+        : context(context), queue(queue), test_name(test_name),
+          num_elements(num_elems)
+    {
+        fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+        fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
+
+        type2name[sizeof(cl_half)] = std::make_pair("half", "short");
+        type2name[sizeof(cl_float)] = std::make_pair("float", "int");
+        type2name[sizeof(cl_double)] = std::make_pair("double", "long");
+
+        std::tuple<cl_float, cl_half, cl_double> it;
+        for_each_elem(it);
+    }
+
+    template <typename T> void generate_random_inputs(std::vector<T> &v)
+    {
+        RandomSeed seed(gRandomSeed);
+
+        if (sizeof(T) == sizeof(cl_half))
+        {
+            // Bound generated half values to 0x1.ffcp+14(32752.0) which is the
+            // largest cl_half value smaller than the max value of cl_short,
+            // 32767.
+            if (int2fp)
+            {
+                auto random_generator = [&seed]() {
+                    return (cl_short)get_random_float(
+                        -MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14),
+                        MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14), seed);
+                };
+                std::generate(v.begin(), v.end(), random_generator);
+            }
+            else
+            {
+                auto random_generator = [&seed]() {
+                    return HFF(get_random_float(
+                        -MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14),
+                        MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14), seed));
+                };
+                std::generate(v.begin(), v.end(), random_generator);
+            }
+        }
+        else if (sizeof(T) == sizeof(cl_float))
+        {
+            auto random_generator = [&seed]() {
+                return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                        MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                        seed);
+            };
+            std::generate(v.begin(), v.end(), random_generator);
+        }
+        else if (sizeof(T) == sizeof(cl_double))
+        {
+            auto random_generator = [&seed]() {
+                return get_random_double(-MAKE_HEX_DOUBLE(0x1.0p63, 0x1, 63),
+                                         MAKE_HEX_DOUBLE(0x1.0p63, 0x1, 63),
+                                         seed);
+            };
+            std::generate(v.begin(), v.end(), random_generator);
+        }
+    }
+
+    template <typename Tx, typename Ty> static bool equal_value(Tx a, Ty b)
+    {
+        return a == (Tx)b;
+    }
+
+    static bool equal_value_from_half(cl_short a, cl_half b)
+    {
+        return a == (cl_short)HTF(b);
+    }
+
+    static bool equal_value_to_half(cl_half a, cl_short b)
+    {
+        return a == HFF((float)b);
+    }
+
+
+    template <typename Tx, typename Ty>
+    int verify_X2Y(std::vector<Tx> input, std::vector<Ty> output)
+    {
+        if (std::is_same<Tx, cl_half>::value
+            || std::is_same<Ty, cl_half>::value)
+        {
+            bool res = true;
+            if (int2fp)
+                res = std::equal(output.begin(), output.end(), input.begin(),
+                                 equal_value_to_half);
+            else
+                res = std::equal(output.begin(), output.end(), input.begin(),
+                                 equal_value_from_half);
+
+            if (!res)
+            {
+                log_error("%s test failed\n", test_name.c_str());
+                return -1;
+            }
+        }
+        else
+        {
+            if (!std::equal(output.begin(), output.end(), input.begin(),
+                            equal_value<Tx, Ty>))
+            {
+                log_error("%s test failed\n", test_name.c_str());
+                return -1;
+            }
+        }
+
+        log_info("%s test passed\n", test_name.c_str());
+        return 0;
+    }
+
+    template <typename Tx, typename Ty> int test_X2Y()
+    {
+        clMemWrapper streams[2];
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        int err;
+
+        std::vector<Tx> input(num_elements);
+        std::vector<Ty> output(num_elements);
+
+        streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                    sizeof(Tx) * num_elements, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
+        streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                    sizeof(Ty) * num_elements, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
+
+        generate_random_inputs(input);
+
+        err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                                   sizeof(Tx) * num_elements, input.data(), 0,
+                                   nullptr, nullptr);
+        test_error(err, "clEnqueueWriteBuffer failed.");
+
+        std::string src_name = type2name[sizeof(Tx)].first;
+        std::string dst_name = type2name[sizeof(Tx)].second;
+        if (int2fp) std::swap(src_name, dst_name);
+
+        std::string build_options;
+        build_options.append("-DTYPE_X=").append(src_name.c_str());
+        build_options.append(" -DTYPE_Y=").append(dst_name.c_str());
+
+        std::string extension;
+        if (sizeof(Tx) == sizeof(cl_double))
+            extension = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+
+        if (sizeof(Tx) == sizeof(cl_half))
+            extension = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
+        std::string kernelSource =
+            str_sprintf(int2float_kernel_code, extension.c_str());
+        const char *ptr = kernelSource.c_str();
+
+        err = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                          "test_X2Y", build_options.c_str());
+        test_error(err, "create_single_kernel_helper failed.");
+
+        err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+        err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+        test_error(err, "clSetKernelArg failed.");
+
+        size_t threads[] = { (size_t)num_elements };
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, threads,
+                                     nullptr, 0, nullptr, nullptr);
+        test_error(err, "clEnqueueNDRangeKernel failed.");
+
+        err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                  sizeof(Ty) * num_elements, output.data(), 0,
+                                  nullptr, nullptr);
+        test_error(err, "clEnqueueReadBuffer failed.");
+
+        err = verify_X2Y(input, output);
+
+        return err;
+    }
+
+    template <typename T> bool skip_type()
+    {
+        if (std::is_same<double, T>::value && !fp64Support)
+            return true;
+        else if (std::is_same<cl_half, T>::value && !fp16Support)
+            return true;
+        return false;
+    }
+
+    template <std::size_t Cnt = 0, typename T> void iterate_type(const T &t)
+    {
+        bool doTest = !skip_type<T>();
+
+        if (doTest)
+        {
+            typedef typename std::conditional<
+                (sizeof(T) == sizeof(std::int16_t)), std::int16_t,
+                typename std::conditional<(sizeof(T) == sizeof(std::int32_t)),
+                                          std::int32_t,
+                                          std::int64_t>::type>::type U;
+            if (int2fp)
+            {
+                if (test_X2Y<U, T>())
+                    throw std::runtime_error("test_X2Y failed\n");
+            }
+            else
+            {
+                if (test_X2Y<T, U>())
+                    throw std::runtime_error("test_X2Y failed\n");
+            }
+        }
+    }
+
+    template <std::size_t Cnt = 0, typename... Tp>
+    inline typename std::enable_if<Cnt == sizeof...(Tp), void>::type
+    for_each_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+
+    template <std::size_t Cnt = 0, typename... Tp>
+        inline typename std::enable_if < Cnt<sizeof...(Tp), void>::type
+        for_each_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_type<Cnt>(std::get<Cnt>(t));
+        for_each_elem<Cnt + 1, Tp...>(t);
+    }
+
+protected:
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_device_fp_config fpConfigHalf;
+    cl_device_fp_config fpConfigFloat;
+
+    bool fp16Support;
+    bool fp64Support;
+
+    std::map<size_t, std::pair<std::string, std::string>> type2name;
+
+    std::string test_name;
+    int num_elements;
+};
+
+}
+
+int test_int2fp(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    try
+    {
+        TypesIterator<true>(device, context, queue, num_elements, "INT2FP");
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+int test_fp2int(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    try
+    {
+        TypesIterator<false>(device, context, queue, num_elements, "FP2INT");
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
diff --git a/test_conformance/basic/test_intmath.cpp b/test_conformance/basic/test_intmath.cpp
index 6fd41ab..5a4e9c2 100644
--- a/test_conformance/basic/test_intmath.cpp
+++ b/test_conformance/basic/test_intmath.cpp
@@ -123,7 +123,7 @@
     size_t datasize = sizeof(T) * num_elements * N;
 
     // Create device buffers.
-    for (int i = 0; i < ARRAY_SIZE(streams); i++)
+    for (size_t i = 0; i < ARRAY_SIZE(streams); i++)
     {
         streams[i] =
             clCreateBuffer(context, CL_MEM_READ_WRITE, datasize, NULL, &err);
@@ -175,7 +175,7 @@
         test_error(err, "clEnqueueReadBuffer failed\n");
 
         // Verify results
-        for (int i = 0; i < num_elements * N; i++)
+        for (unsigned i = 0; i < num_elements * N; i++)
         {
             T r = test.ref(inputA[i], inputB[i], inputC[i]);
             if (r != output[i])
diff --git a/test_conformance/basic/test_loop.cpp b/test_conformance/basic/test_loop.cpp
index 1a91d9e..1c9acd1 100644
--- a/test_conformance/basic/test_loop.cpp
+++ b/test_conformance/basic/test_loop.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,45 +21,45 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <vector>
 
 #include "procs.h"
 
-const char *loop_kernel_code =
-"__kernel void test_loop(__global int *src, __global int *loopindx, __global int *loopcnt, __global int *dst)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"    int  n = get_global_size(0);\n"
-"    int  i, j;\n"
-"\n"
-"    dst[tid] = 0;\n"
-"    for (i=0,j=loopindx[tid]; i<loopcnt[tid]; i++,j++)\n"
-"    {\n"
-"        if (j >= n)\n"
-"            j = 0;\n"
-"        dst[tid] += src[j];\n"
-"    }\n"
-"\n"
-"}\n";
-
-
-int
-verify_loop(int *inptr, int *loopindx, int *loopcnt, int *outptr, int n)
+namespace {
+const char *loop_kernel_code = R"(
+__kernel void test_loop(__global int *src, __global int *loopindx, __global int *loopcnt, __global int *dst)
 {
-    int     r, i, j, k;
+    int  tid = get_global_id(0);
+    int  n = get_global_size(0);
+    int  i, j;
 
-    for (i=0; i<n; i++)
+    dst[tid] = 0;
+    for (i=0, j=loopindx[tid]; i<loopcnt[tid]; i++, j++)
     {
-        r = 0;
-        for (j=0,k=loopindx[i]; j<loopcnt[i]; j++,k++)
+        if (j >= n)
+            j = 0;
+        dst[tid] += src[j];
+    }
+}
+)";
+
+
+int verify_loop(std::vector<cl_int> inptr, std::vector<cl_int> loopindx,
+                std::vector<cl_int> loopcnt, std::vector<cl_int> outptr, int n)
+{
+    for (int i = 0; i < n; i++)
+    {
+        int r = 0;
+        for (int j = 0, k = loopindx[i]; j < loopcnt[i]; j++, k++)
         {
-            if (k >= n)
-                k = 0;
+            if (k >= n) k = 0;
             r += inptr[k];
         }
 
         if (r != outptr[i])
         {
-            log_error("LOOP test failed: %d found, expected %d\n", outptr[i], r);
+            log_error("LOOP test failed: %d found, expected %d\n", outptr[i],
+                      r);
             return -1;
         }
     }
@@ -67,119 +67,69 @@
     log_info("LOOP test passed\n");
     return 0;
 }
-
-int test_loop(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+}
+int test_loop(cl_device_id device, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    cl_mem streams[4];
-    cl_int *input_ptr, *loop_indx, *loop_cnt, *output_ptr;
-    cl_program program;
-    cl_kernel kernel;
-    size_t threads[1];
-    int err, i;
+    clMemWrapper streams[4];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    int err;
 
     size_t length = sizeof(cl_int) * num_elements;
-    input_ptr  = (cl_int*)malloc(length);
-    loop_indx  = (cl_int*)malloc(length);
-    loop_cnt   = (cl_int*)malloc(length);
-    output_ptr = (cl_int*)malloc(length);
+    std::vector<cl_int> input(length);
+    std::vector<cl_int> loop_indx(length);
+    std::vector<cl_int> loop_cnt(length);
+    std::vector<cl_int> output(length);
 
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[0])
+    for (auto &stream : streams)
     {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[2])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[3] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[3])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
+        stream =
+            clCreateBuffer(context, CL_MEM_READ_WRITE, length, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
     }
 
-    MTdata d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    RandomSeed seed(gRandomSeed);
+    for (int i = 0; i < num_elements; i++)
     {
-        input_ptr[i] = (int)genrand_int32(d);
-        loop_indx[i] = (int)get_random_float(0, num_elements-1, d);
-        loop_cnt[i] = (int)get_random_float(0, num_elements/32, d);
-    }
-    free_mtdata(d); d = NULL;
+        input[i] = static_cast<int>(genrand_int32(seed));
+        loop_indx[i] =
+            static_cast<int>(get_random_float(0, num_elements - 1, seed));
+        loop_cnt[i] =
+            static_cast<int>(get_random_float(0, num_elements / 32, seed));
+    };
 
-  err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length, input_ptr, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
-  err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length, loop_indx, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
-  err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, length, loop_cnt, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length,
+                               input.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length,
+                               loop_indx.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, length,
+                               loop_cnt.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
 
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &loop_kernel_code, "test_loop" );
-  if (err)
-    return -1;
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &loop_kernel_code, "test_loop");
+    test_error(err, "create_single_kernel_helper failed.");
 
-  err  = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-  err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-  err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-  err |= clSetKernelArg(kernel, 3, sizeof streams[3], &streams[3]);
-    if (err != CL_SUCCESS)
+    for (int i = 0; i < ARRAY_SIZE(streams); i++)
     {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
+        err = clSetKernelArg(kernel, i, sizeof streams[i], &streams[i]);
+        test_error(err, "clSetKernelArgs failed\n");
     }
 
-    threads[0] = (unsigned int)num_elements;
-  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueNDRangeKernel failed\n");
-    return -1;
-  }
+    size_t threads[] = { (size_t)num_elements };
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, threads, nullptr, 0,
+                                 nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed\n");
 
-  err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clReadArray failed\n");
-    return -1;
-  }
+    err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length,
+                              output.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed\n");
 
-  err = verify_loop(input_ptr, loop_indx, loop_cnt, output_ptr, num_elements);
+    err = verify_loop(input, loop_indx, loop_cnt, output, num_elements);
 
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseMemObject(streams[3]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(loop_indx);
-    free(loop_cnt);
-    free(output_ptr);
 
     return err;
 }
-
-
diff --git a/test_conformance/basic/test_progvar.cpp b/test_conformance/basic/test_progvar.cpp
index a46713e..41cc019 100644
--- a/test_conformance/basic/test_progvar.cpp
+++ b/test_conformance/basic/test_progvar.cpp
@@ -581,13 +581,19 @@
         cl_uint max_dim = 0;
         status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                                  sizeof(max_dim), &max_dim, 0);
-        assert(status == CL_SUCCESS);
+        if (check_error(status,
+                        "clGetDeviceInfo for "
+                        "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
+            return;
         assert(max_dim > 0);
         size_t max_id[3];
         max_id[0] = 0;
         status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                  max_dim * sizeof(size_t), &max_id[0], 0);
-        assert(status == CL_SUCCESS);
+        if (check_error(status,
+                        "clGetDeviceInfo for "
+                        "CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
+            return;
         l_max_global_id0 = max_id[0];
     }
 
@@ -597,7 +603,10 @@
         status =
             clGetDeviceInfo(device, CL_DEVICE_LINKER_AVAILABLE,
                             sizeof(l_linker_available), &l_linker_available, 0);
-        assert(status == CL_SUCCESS);
+        if (check_error(status,
+                        "clGetDeviceInfo for "
+                        "CL_DEVICE_LINKER_AVAILABLE failed."))
+            return;
     }
 }
 
@@ -903,6 +912,7 @@
                                vol, tn, vol, tn, vol, tn, vol, tn);
     }
     assert(num_printed < sizeof(decls));
+    (void)num_printed;
     return std::string(decls);
 }
 
@@ -983,6 +993,7 @@
                                writer_template_atomic, ti.get_buf_elem_type());
     }
     assert(num_printed < sizeof(writer_src));
+    (void)num_printed;
     std::string result = writer_src;
     return result;
 }
@@ -1024,6 +1035,7 @@
                      ti.get_buf_elem_type(), ti.get_buf_elem_type());
     }
     assert(num_printed < sizeof(reader_src));
+    (void)num_printed;
     std::string result = reader_src;
     return result;
 }
diff --git a/test_conformance/basic/test_vec_type_hint.cpp b/test_conformance/basic/test_vec_type_hint.cpp
index 33168b1..0ba105d 100644
--- a/test_conformance/basic/test_vec_type_hint.cpp
+++ b/test_conformance/basic/test_vec_type_hint.cpp
@@ -13,28 +13,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
-
 static const char *sample_kernel = {
-  "%s\n" // optional pragma string
-  "__kernel __attribute__((vec_type_hint(%s%s))) void sample_test(__global int *src, __global int *dst)\n"
-  "{\n"
-  "    int  tid = get_global_id(0);\n"
-  "     dst[tid] = src[tid];\n"
-  "\n"
-  "}\n"
+    "%s\n"
+    "__kernel __attribute__((vec_type_hint(%s%s))) void sample_test(__global "
+    "int *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     dst[tid] = src[tid];\n"
+    "\n"
+    "}\n"
 };
 
 int test_vec_type_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
@@ -42,66 +41,85 @@
   int error;
   int vec_type_index, vec_size_index;
 
-  ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
-    const char *size_names[] = {"", "2", "4", "8", "16"};
-    char *program_source;
+  ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt,   kUInt,
+                             kLong, kULong, kFloat, kHalf,   kDouble };
+  const char *size_names[] = { "", "2", "4", "8", "16" };
+  std::vector<char> program_source(4096);
 
-  program_source = (char*)malloc(sizeof(char)*4096);
+  for (vec_type_index = 0;
+       vec_type_index < sizeof(vecType) / sizeof(vecType[0]); vec_type_index++)
+  {
 
-  for (vec_type_index=0; vec_type_index<10; vec_type_index++) {
-    if (vecType[vec_type_index] == kDouble) {
-      if (!is_extension_available(deviceID, "cl_khr_fp64")) {
-        log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
-        continue;
+      if (vecType[vec_type_index] == kHalf
+          && !is_extension_available(deviceID, "cl_khr_fp16"))
+      {
+          log_info(
+              "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+          continue;
       }
-      log_info("Testing doubles.\n");
-    }
+      else if (vecType[vec_type_index] == kDouble
+               && !is_extension_available(deviceID, "cl_khr_fp64"))
+      {
+          log_info(
+              "Extension cl_khr_fp64 not supported; skipping double tests.\n");
+          continue;
+      }
+      else if ((vecType[vec_type_index] == kLong
+                || vecType[vec_type_index] == kULong)
+               && !gHasLong)
+      {
+          log_info(
+              "Extension cl_khr_int64 not supported; skipping long tests.\n");
+          continue;
+      }
 
-    if (vecType[vec_type_index] == kLong || vecType[vec_type_index] == kULong)
-    {
-        if (!gHasLong)
-        {
-            log_info("Extension cl_khr_int64 not supported; skipping long tests.\n");
-            continue;
-        }
-    }
+      for (vec_size_index = 0; vec_size_index < 5; vec_size_index++)
+      {
+          clProgramWrapper program;
+          clKernelWrapper kernel;
+          clMemWrapper in, out;
+          size_t global[] = { 1, 1, 1 };
 
-    for (vec_size_index=0; vec_size_index<5; vec_size_index++) {
-      clProgramWrapper program;
-      clKernelWrapper kernel;
-      clMemWrapper in, out;
-      size_t global[] = {1,1,1};
+          log_info("Testing __attribute__((vec_type_hint(%s%s))...\n",
+                   get_explicit_type_name(vecType[vec_type_index]),
+                   size_names[vec_size_index]);
+          char extension[128] = { 0 };
+          if (vecType[vec_type_index] == kDouble)
+              std::snprintf(extension, sizeof(extension),
+                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
+          else if (vecType[vec_type_index] == kHalf)
+              std::snprintf(extension, sizeof(extension),
+                            "#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
 
-      log_info("Testing __attribute__((vec_type_hint(%s%s))...\n", get_explicit_type_name(vecType[vec_type_index]), size_names[vec_size_index]);
+          sprintf(program_source.data(), sample_kernel, extension,
+                  get_explicit_type_name(vecType[vec_type_index]),
+                  size_names[vec_size_index]);
 
-      program_source[0] = '\0';
-      sprintf(program_source, sample_kernel,
-              (vecType[vec_type_index] == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-              get_explicit_type_name(vecType[vec_type_index]), size_names[vec_size_index]);
+          const char *src = &program_source.front();
+          error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                              &src, "sample_test");
+          test_error(error, "create_single_kernel_helper failed");
 
-      error = create_single_kernel_helper( context, &program, &kernel, 1, (const char**)&program_source, "sample_test" );
-      if( error != 0 )
-        return error;
+          in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * 10,
+                              NULL, &error);
+          test_error(error, "clCreateBuffer failed");
+          out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * 10,
+                               NULL, &error);
+          test_error(error, "clCreateBuffer failed");
 
-      in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int)*10, NULL, &error);
-      test_error(error, "clCreateBuffer failed");
-      out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int)*10, NULL, &error);
-      test_error(error, "clCreateBuffer failed");
+          error = clSetKernelArg(kernel, 0, sizeof(in), &in);
+          test_error(error, "clSetKernelArg failed");
+          error = clSetKernelArg(kernel, 1, sizeof(out), &out);
+          test_error(error, "clSetKernelArg failed");
 
-      error = clSetKernelArg(kernel, 0, sizeof(in), &in);
-      test_error(error, "clSetKernelArg failed");
-      error = clSetKernelArg(kernel, 1, sizeof(out), &out);
-      test_error(error, "clSetKernelArg failed");
+          error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL,
+                                         0, NULL, NULL);
+          test_error(error, "clEnqueueNDRangeKernel failed");
 
-      error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
-      test_error(error, "clEnqueueNDRangeKernel failed");
-
-      error = clFinish(queue);
-      test_error(error, "clFinish failed");
-    }
+          error = clFinish(queue);
+          test_error(error, "clFinish failed");
+      }
   }
 
-  free(program_source);
-
   return 0;
 }
diff --git a/test_conformance/basic/test_vector_creation.cpp b/test_conformance/basic/test_vector_creation.cpp
index d9530b4..6bae156 100644
--- a/test_conformance/basic/test_vector_creation.cpp
+++ b/test_conformance/basic/test_vector_creation.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,48 +17,41 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
+#include <vector>
 
-
-
+#include <CL/cl_half.h>
 
 #define DEBUG 0
 #define DEPTH 16
 // Limit the maximum code size for any given kernel.
-#define MAX_CODE_SIZE (1024*32)
+#define MAX_CODE_SIZE (1024 * 32)
 
-const int sizes[] = {1, 2, 3, 4, 8, 16, -1, -1, -1, -1};
-const char *size_names[] = {"", "2", "3", "4", "8", "16" , "!!a", "!!b", "!!c", "!!d"};
+static const int sizes[] = { 1, 2, 3, 4, 8, 16, -1, -1, -1, -1 };
+static const int initial_no_sizes[] = { 0, 0, 0, 0, 0, 0, 2 };
+static const char *size_names[] = { "",   "2",   "3",   "4",   "8",
+                                    "16", "!!a", "!!b", "!!c", "!!d" };
+static char extension[128] = { 0 };
 
-// Creates a kernel by enumerating all possible ways of building the vector out of vloads
-// skip_to_results will skip results up to a given number. If the amount of code generated
-// is greater than MAX_CODE_SIZE, this function will return the number of results used,
-// which can then be used as the skip_to_result value to continue where it left off.
-int create_kernel(ExplicitType type, int output_size, char *program, int *number_of_results, int skip_to_result) {
+// Creates a kernel by enumerating all possible ways of building the vector out
+// of vloads. skip_to_result skips results up to a given number. If the amount
+// of code generated is greater than MAX_CODE_SIZE, this function returns the
+// number of results used, which can then be passed as the skip_to_result value
+// to continue where it left off.
+int create_kernel(ExplicitType type, int output_size, char *program,
+                  int *number_of_results, int skip_to_result)
+{
 
     int number_of_sizes;
 
-    switch (output_size) {
-        case 1:
-            number_of_sizes = 1;
-            break;
-        case 2:
-            number_of_sizes = 2;
-            break;
-        case 3:
-            number_of_sizes = 3;
-            break;
-        case 4:
-            number_of_sizes = 4;
-            break;
-        case 8:
-            number_of_sizes = 5;
-            break;
-        case 16:
-            number_of_sizes = 6;
-            break;
-        default:
-            log_error("Invalid size: %d\n", output_size);
-            return -1;
+    switch (output_size)
+    {
+        case 1: number_of_sizes = 1; break;
+        case 2: number_of_sizes = 2; break;
+        case 3: number_of_sizes = 3; break;
+        case 4: number_of_sizes = 4; break;
+        case 8: number_of_sizes = 5; break;
+        case 16: number_of_sizes = 6; break;
+        default: log_error("Invalid size: %d\n", output_size); return -1;
     }
 
     int total_results = 0;
@@ -67,102 +60,125 @@
     int total_program_length = 0;
     int aborted_due_to_size = 0;
 
-    if (skip_to_result < 0)
-        skip_to_result = 0;
+    if (skip_to_result < 0) skip_to_result = 0;
 
     // The line of code for the vector creation
     char line[1024];
-    // Keep track of what size vector we are using in each position so we can iterate through all fo them
+    // Keep track of what size vector we are using in each position so we can
+    // iterate through all of them
     int pos[DEPTH];
     int max_size = output_size;
     if (DEBUG > 1) log_info("max_size: %d\n", max_size);
 
     program[0] = '\0';
-    sprintf(program, "%s\n__kernel void test_vector_creation(__global %s *src, __global %s%s *result) {\n",
-            type == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-            get_explicit_type_name(type), get_explicit_type_name(type), ( number_of_sizes == 3 ) ? "" : size_names[number_of_sizes-1]);
+    sprintf(program,
+            "%s\n__kernel void test_vector_creation(__global %s *src, __global "
+            "%s%s *result) {\n",
+            extension, get_explicit_type_name(type),
+            get_explicit_type_name(type),
+            (number_of_sizes == 3) ? "" : size_names[number_of_sizes - 1]);
     total_program_length += (int)strlen(program);
 
-    char storePrefix[ 128 ], storeSuffix[ 128 ];
+    char storePrefix[128], storeSuffix[128];
 
-    // Start out trying sizes 1,1,1,1,1...
-    for (int i=0; i<DEPTH; i++)
-        pos[i] = 0;
+    // Start out trying sizes 1,1,1... by initializing pos array to zeros for
+    // all vector sizes except 16. For 16-sizes initial_no_sizes array holds
+    // factor to omit time consuming, similar creation cases tested earlier.
+    for (int i = 0; i < DEPTH; i++) pos[i] = initial_no_sizes[number_of_sizes];
 
     int done = 0;
-    while (!done) {
-        if (DEBUG > 1) {
+    while (!done)
+    {
+        if (DEBUG > 1)
+        {
             log_info("pos size[] = [");
-            for (int k=0; k<DEPTH; k++)
-                log_info(" %d ", pos[k]);
+            for (int k = 0; k < DEPTH; k++) log_info(" %d ", pos[k]);
             log_info("]\n");
         }
 
-        // Go through the selected vector sizes and see if the first n of them fit the
+        // Go through the selected vector sizes and see if the first n of them
+        // fit the
         //  required size exactly.
         int size_so_far = 0;
         int vloads;
-        for ( vloads=0; vloads<DEPTH; vloads++) {
-            if (size_so_far + sizes[pos[vloads]] <= max_size) {
+        for (vloads = 0; vloads < DEPTH; vloads++)
+        {
+            if (size_so_far + sizes[pos[vloads]] <= max_size)
+            {
                 size_so_far += sizes[pos[vloads]];
-            } else {
+            }
+            else
+            {
                 break;
             }
         }
-        if (DEBUG > 1)  log_info("vloads: %d, size_so_far:%d\n", vloads, size_so_far);
+        if (DEBUG > 1)
+            log_info("vloads: %d, size_so_far:%d\n", vloads, size_so_far);
 
-        // If they did not fit the required size exactly it is too long, so there is no point in checking any other combinations
+        // If they did not fit the required size exactly it is too long, so
+        // there is no point in checking any other combinations
         //  of the sizes to the right. Prune them from the search.
-        if (size_so_far != max_size) {
+        if (size_so_far != max_size)
+        {
             // Zero all the sizes to the right
-            for (int k=vloads+1; k<DEPTH; k++) {
+            for (int k = vloads + 1; k < DEPTH; k++)
+            {
                 pos[k] = 0;
             }
             // Increment this current size and propagate the values up if needed
-            for (int d=vloads; d>=0; d--) {
+            for (int d = vloads; d >= 0; d--)
+            {
                 pos[d]++;
-                if (pos[d] >= number_of_sizes) {
+                if (pos[d] >= number_of_sizes)
+                {
                     pos[d] = 0;
-                    if (d == 0) {
+                    if (d == 0)
+                    {
                         // If we rolled over then we are done
                         done = 1;
                         break;
                     }
-                } else {
+                }
+                else
+                {
                     break;
                 }
             }
-            // Go on to the next size since this one (and all others "under" it) didn't fit
+            // Go on to the next size since this one (and all others "under" it)
+            // didn't fit
             continue;
         }
 
 
         // Generate the actual load line if we are building this part
-        line[0]= '\0';
-        if (skip_to_result == 0 || total_results >= skip_to_result) {
-            if( number_of_sizes == 3 )
+        line[0] = '\0';
+        if (skip_to_result == 0 || total_results >= skip_to_result)
+        {
+            if (number_of_sizes == 3)
             {
-                sprintf( storePrefix, "vstore3( " );
-                sprintf( storeSuffix, ", %d, result )", current_result );
+                sprintf(storePrefix, "vstore3( ");
+                sprintf(storeSuffix, ", %d, result )", current_result);
             }
             else
             {
-                sprintf( storePrefix, "result[%d] = ", current_result );
-                storeSuffix[ 0 ] = 0;
+                sprintf(storePrefix, "result[%d] = ", current_result);
+                storeSuffix[0] = 0;
             }
 
-            sprintf(line, "\t%s(%s%d)(", storePrefix, get_explicit_type_name(type), output_size);
+            sprintf(line, "\t%s(%s%d)(", storePrefix,
+                    get_explicit_type_name(type), output_size);
             current_result++;
 
             int offset = 0;
-            for (int i=0; i<vloads; i++) {
+            for (int i = 0; i < vloads; i++)
+            {
                 if (pos[i] == 0)
                     sprintf(line + strlen(line), "src[%d]", offset);
                 else
-                    sprintf(line + strlen(line), "vload%s(0,src+%d)", size_names[pos[i]], offset);
+                    sprintf(line + strlen(line), "vload%s(0,src+%d)",
+                            size_names[pos[i]], offset);
                 offset += sizes[pos[i]];
-                if (i<(vloads-1))
-                    sprintf(line + strlen(line), ",");
+                if (i < (vloads - 1)) sprintf(line + strlen(line), ",");
             }
             sprintf(line + strlen(line), ")%s;\n", storeSuffix);
 
@@ -171,7 +187,8 @@
         }
         total_results++;
         total_program_length += (int)strlen(line);
-        if (total_program_length > MAX_CODE_SIZE) {
+        if (total_program_length > MAX_CODE_SIZE)
+        {
             aborted_due_to_size = 1;
             done = 1;
         }
@@ -179,132 +196,194 @@
 
         if (DEBUG) log_info("line is: %s", line);
 
-        // If we did not use all of them, then we ignore any changes further to the right.
-        // We do this by causing those loops to skip on the next iteration.
-        if (vloads < DEPTH) {
+        // If we did not use all of them, then we ignore any changes further to
+        // the right. We do this by causing those loops to skip on the next
+        // iteration.
+        if (vloads < DEPTH)
+        {
             if (DEBUG > 1) log_info("done with this depth\n");
-            for (int k=vloads; k<DEPTH; k++)
-                pos[k] = number_of_sizes;
+            for (int k = vloads; k < DEPTH; k++) pos[k] = number_of_sizes;
         }
 
         // Increment the far right size by 1, rolling over as needed
-        for (int d=DEPTH-1; d>=0; d--) {
+        for (int d = DEPTH - 1; d >= 0; d--)
+        {
             pos[d]++;
-            if (pos[d] >= number_of_sizes) {
+            if (pos[d] >= number_of_sizes)
+            {
                 pos[d] = 0;
-                if (d == 0) {
+                if (d == 0)
+                {
                     // If we rolled over at the far-left then we are done
                     done = 1;
                     break;
                 }
-            } else {
+            }
+            else
+            {
                 break;
             }
         }
-        if (done)
-            break;
+        if (done) break;
 
         // Continue until we are done.
     }
-    strcat(program, "}\n\n"); //log_info("%s\n", program);
+    strcat(program, "}\n\n"); // log_info("%s\n", program);
     total_program_length += 3;
-    if (DEBUG) log_info("\t\t(Program for vector type %s%s contains %d vector creations, of total program length %gkB, with a total of %d vloads.)\n",
-                        get_explicit_type_name(type), size_names[number_of_sizes-1], total_results, total_program_length/1024.0, total_vloads);
+    if (DEBUG)
+        log_info(
+            "\t\t(Program for vector type %s%s contains %d vector creations, "
+            "of total program length %gkB, with a total of %d vloads.)\n",
+            get_explicit_type_name(type), size_names[number_of_sizes - 1],
+            total_results, total_program_length / 1024.0, total_vloads);
     *number_of_results = current_result;
-    if (aborted_due_to_size)
-        return total_results;
+    if (aborted_due_to_size) return total_results;
     return 0;
 }
 
 
-
-
-int test_vector_creation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_vector_creation(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16};
+    const std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                                kInt,   kUInt,  kLong,  kULong,
+                                                kFloat, kHalf,  kDouble };
+    // should be in sync with global array size_names
+    const std::vector<unsigned int> vecSizes = { 1, 2, 3, 4, 8, 16 };
 
-    char *program_source;
-    int error;
+    int error = CL_SUCCESS;
     int total_errors = 0;
+    int number_of_results = 0;
 
-    cl_int input_data_int[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-    cl_double input_data_double[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-    void *input_data_converted;
-    void *output_data;
-
-    int number_of_results;;
-
-    input_data_converted = malloc(sizeof(cl_double)*16);
-    program_source = (char*)malloc(sizeof(char)*1024*1024*4);
+    std::vector<char> input_data_converted(sizeof(cl_double) * 16);
+    std::vector<char> program_source(sizeof(char) * 1024 * 1024 * 4);
+    std::vector<char> output_data;
 
     // Iterate over all the types
-    for (int type_index=0; type_index<10; type_index++) {
-    if(!gHasLong && ((vecType[type_index] == kLong)  || (vecType[type_index] == kULong)))
+    for (size_t type_index = 0; type_index < vecType.size(); type_index++)
     {
-      log_info("Long/ULong data type not supported on this device\n");
-      continue;
-    }
 
-        clMemWrapper input;
-
-        if (vecType[type_index] == kDouble) {
-            if (!is_extension_available(deviceID, "cl_khr_fp64")) {
-                log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
+        if (!gHasLong
+            && ((vecType[type_index] == kLong)
+                || (vecType[type_index] == kULong)))
+        {
+            log_info("Long/ULong data type not supported on this device\n");
+            continue;
+        }
+        else if (vecType[type_index] == kDouble)
+        {
+            if (!is_extension_available(deviceID, "cl_khr_fp64"))
+            {
+                log_info("Extension cl_khr_fp64 not supported; skipping double "
+                         "tests.\n");
                 continue;
             }
-            log_info("Testing doubles.\n");
+            snprintf(extension, sizeof(extension), "%s",
+                     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
         }
+        else if (vecType[type_index] == kHalf)
+        {
+            if (!is_extension_available(deviceID, "cl_khr_fp16"))
+            {
+                log_info("Extension cl_khr_fp16 not supported; skipping half "
+                         "tests.\n");
+                continue;
+            }
+            snprintf(extension, sizeof(extension), "%s",
+                     "#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
+        }
+
+        log_info("Testing %s.\n", get_explicit_type_name(vecType[type_index]));
 
         // Convert the data to the right format for the test.
-        memset(input_data_converted, 0xff, sizeof(cl_double)*16);
-        if (vecType[type_index] != kDouble) {
-            for (int j=0; j<16; j++) {
-                convert_explicit_value(&input_data_int[j], ((char*)input_data_converted)+get_explicit_type_size(vecType[type_index])*j,
-                                       kInt, 0, kRoundToEven, vecType[type_index]);
+        memset(input_data_converted.data(), 0xff, sizeof(cl_double) * 16);
+        if (vecType[type_index] == kDouble)
+        {
+            const cl_double input_data_double[16] = { 0,  1,  2,  3, 4,  5,
+                                                      6,  7,  8,  9, 10, 11,
+                                                      12, 13, 14, 15 };
+            memcpy(input_data_converted.data(), &input_data_double,
+                   sizeof(cl_double) * 16);
+        }
+        else if (vecType[type_index] == kHalf)
+        {
+            cl_half *buf =
+                reinterpret_cast<cl_half *>(input_data_converted.data());
+            for (int j = 0; j < 16; j++)
+                buf[j] = cl_half_from_float(float(j), CL_HALF_RTE);
+        }
+        else
+        {
+            for (int j = 0; j < 16; j++)
+            {
+                convert_explicit_value(
+                    &j,
+                    ((char *)input_data_converted.data())
+                        + get_explicit_type_size(vecType[type_index]) * j,
+                    kInt, 0, kRoundToEven, vecType[type_index]);
             }
-        } else {
-            memcpy(input_data_converted, &input_data_double, sizeof(cl_double)*16);
         }
 
-        input = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, get_explicit_type_size(vecType[type_index])*16,
-                               (vecType[type_index] != kDouble) ? input_data_converted : input_data_double, &error);
-        if (error) {
+        clMemWrapper input =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           get_explicit_type_size(vecType[type_index]) * 16,
+                           input_data_converted.data(), &error);
+        if (error)
+        {
             print_error(error, "clCreateBuffer failed");
             total_errors++;
             continue;
         }
 
         // Iterate over all the vector sizes.
-        for (int size_index=1; size_index< 5; size_index++) {
-            size_t global[] = {1,1,1};
+        for (size_t size_index = 1; size_index < vecSizes.size(); size_index++)
+        {
+            size_t global[] = { 1, 1, 1 };
             int number_generated = -1;
             int previous_number_generated = 0;
 
-            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[type_index]), size_names[size_index]);
-            while (number_generated != 0) {
+            log_info("Testing %s%s...\n",
+                     get_explicit_type_name(vecType[type_index]),
+                     size_names[size_index]);
+            while (number_generated != 0)
+            {
                 clMemWrapper output;
                 clKernelWrapper kernel;
                 clProgramWrapper program;
 
-                number_generated = create_kernel(vecType[type_index], vecSizes[size_index], program_source, &number_of_results, number_generated);
-                if (number_generated != 0) {
+                number_generated =
+                    create_kernel(vecType[type_index], vecSizes[size_index],
+                                  program_source.data(), &number_of_results,
+                                  number_generated);
+                if (number_generated != 0)
+                {
                     if (previous_number_generated == 0)
-                        log_info("Code size greater than %gkB; splitting test into multiple kernels.\n", MAX_CODE_SIZE/1024.0);
-                    log_info("\tExecuting vector permutations %d to %d...\n", previous_number_generated, number_generated-1);
+                        log_info("Code size greater than %gkB; splitting test "
+                                 "into multiple kernels.\n",
+                                 MAX_CODE_SIZE / 1024.0);
+                    log_info("\tExecuting vector permutations %d to %d...\n",
+                             previous_number_generated, number_generated - 1);
                 }
 
-                error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&program_source, "test_vector_creation");
-                if (error) {
+                char *src = program_source.data();
+                error = create_single_kernel_helper(context, &program, &kernel,
+                                                    1, (const char **)&src,
+                                                    "test_vector_creation");
+                if (error)
+                {
                     log_error("create_single_kernel_helper failed.\n");
                     total_errors++;
                     break;
                 }
 
-                output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
-                                        number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
-                                        NULL, &error);
-                if (error) {
+                output = clCreateBuffer(
+                    context, CL_MEM_WRITE_ONLY,
+                    number_of_results
+                        * get_explicit_type_size(vecType[type_index])
+                        * vecSizes[size_index],
+                    NULL, &error);
+                if (error)
+                {
                     print_error(error, "clCreateBuffer failed");
                     total_errors++;
                     break;
@@ -312,95 +391,115 @@
 
                 error = clSetKernelArg(kernel, 0, sizeof(input), &input);
                 error |= clSetKernelArg(kernel, 1, sizeof(output), &output);
-                if (error) {
+                if (error)
+                {
                     print_error(error, "clSetKernelArg failed");
                     total_errors++;
                     break;
                 }
 
-                error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
-                if (error) {
+                error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global,
+                                               NULL, 0, NULL, NULL);
+                if (error)
+                {
                     print_error(error, "clEnqueueNDRangeKernel failed");
                     total_errors++;
                     break;
                 }
 
                 error = clFinish(queue);
-                if (error) {
+                if (error)
+                {
                     print_error(error, "clFinish failed");
                     total_errors++;
                     break;
                 }
 
-                output_data = malloc(number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
-                if (output_data == NULL) {
-                    log_error("Failed to allocate memory for output data.\n");
-                    total_errors++;
-                    break;
-                }
-                memset(output_data, 0xff, number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
-                error = clEnqueueReadBuffer(queue, output, CL_TRUE, 0,
-                                            number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
-                                            output_data, 0, NULL, NULL);
-                if (error) {
+                output_data.resize(number_of_results
+                                   * get_explicit_type_size(vecType[type_index])
+                                   * vecSizes[size_index]);
+                memset(output_data.data(), 0xff,
+                       number_of_results
+                           * get_explicit_type_size(vecType[type_index])
+                           * vecSizes[size_index]);
+                error = clEnqueueReadBuffer(
+                    queue, output, CL_TRUE, 0,
+                    number_of_results
+                        * get_explicit_type_size(vecType[type_index])
+                        * vecSizes[size_index],
+                    output_data.data(), 0, NULL, NULL);
+                if (error)
+                {
                     print_error(error, "clEnqueueReadBuffer failed");
                     total_errors++;
-                    free(output_data);
                     break;
                 }
 
                 // Check the results
-                char *res = (char *)output_data;
-                char *exp = (char *)input_data_converted;
-                for (int i=0; i<number_of_results; i++) {
+                char *res = (char *)output_data.data();
+                char *exp = (char *)input_data_converted.data();
+                for (int i = 0; i < number_of_results; i++)
+                {
                     // If they do not match, then print out why
-                    if (memcmp(input_data_converted,
-                               res + i*(get_explicit_type_size(vecType[type_index])*vecSizes[size_index]),
-                               get_explicit_type_size(vecType[type_index])*vecSizes[size_index])
-                        ) {
+                    if (memcmp(exp,
+                               res
+                                   + i
+                                       * (get_explicit_type_size(
+                                              vecType[type_index])
+                                          * vecSizes[size_index]),
+                               get_explicit_type_size(vecType[type_index])
+                                   * vecSizes[size_index]))
+                    {
                         log_error("Data failed to validate for result %d\n", i);
 
-                        // Find the line in the program that failed. This is ugly.
-                        char search[32];
-                        char found_line[1024];
-                        found_line[0]='\0';
-                        search[0]='\0';
+                        // Find the line in the program that failed. This is
+                        // ugly.
+                        char search[32] = { 0 };
+                        char found_line[1024] = { 0 };
                         sprintf(search, "result[%d] = (", i);
-                        char *start_loc = strstr(program_source, search);
+                        char *start_loc = strstr(program_source.data(), search);
                         if (start_loc == NULL)
-                            log_error("Failed to find program source for failure for %s in \n%s", search, program_source);
-                        else {
-                          char *end_loc = strstr(start_loc, "\n");
-                          memcpy(&found_line, start_loc, (end_loc-start_loc));
-                          found_line[end_loc-start_loc]='\0';
-                          log_error("Failed vector line: %s\n", found_line);
+                            log_error("Failed to find program source for "
+                                      "failure for %s in \n%s",
+                                      search, program_source.data());
+                        else
+                        {
+                            char *end_loc = strstr(start_loc, "\n");
+                            memcpy(&found_line, start_loc,
+                                   (end_loc - start_loc));
+                            found_line[end_loc - start_loc] = '\0';
+                            log_error("Failed vector line: %s\n", found_line);
                         }
 
-                        for (int j=0; j<(int)vecSizes[size_index]; j++) {
-                            char expected_value[64];
-                            char returned_value[64];
-                            expected_value[0]='\0';
-                            returned_value[0]='\0';
-                            print_type_to_string(vecType[type_index], (void*)(res+get_explicit_type_size(vecType[type_index])*(i*vecSizes[size_index]+j)), returned_value);
-                            print_type_to_string(vecType[type_index], (void*)(exp+get_explicit_type_size(vecType[type_index])*j), expected_value);
-                            log_error("index [%d, component %d]: got: %s expected: %s\n", i, j,
-                                      returned_value, expected_value);
+                        for (int j = 0; j < (int)vecSizes[size_index]; j++)
+                        {
+                            char expected_value[64] = { 0 };
+                            char returned_value[64] = { 0 };
+                            print_type_to_string(
+                                vecType[type_index],
+                                (void *)(res
+                                         + get_explicit_type_size(
+                                               vecType[type_index])
+                                             * (i * vecSizes[size_index] + j)),
+                                returned_value);
+                            print_type_to_string(
+                                vecType[type_index],
+                                (void *)(exp
+                                         + get_explicit_type_size(
+                                               vecType[type_index])
+                                             * j),
+                                expected_value);
+                            log_error("index [%d, component %d]: got: %s "
+                                      "expected: %s\n",
+                                      i, j, returned_value, expected_value);
                         }
-
                         total_errors++;
                     }
                 }
-                free(output_data);
                 previous_number_generated = number_generated;
             } // number_generated != 0
-
         } // vector sizes
     } // vector types
 
-    free(input_data_converted);
-    free(program_source);
-
     return total_errors;
 }
-
-
diff --git a/test_conformance/basic/test_vector_swizzle.cpp b/test_conformance/basic/test_vector_swizzle.cpp
index 884bcf3..fdbc891 100644
--- a/test_conformance/basic/test_vector_swizzle.cpp
+++ b/test_conformance/basic/test_vector_swizzle.cpp
@@ -22,6 +22,8 @@
 #include "procs.h"
 #include "harness/testHarness.h"
 
+static std::string pragma_extension;
+
 template <int N> struct TestInfo
 {
 };
@@ -629,7 +631,9 @@
         clProgramWrapper program;
         clKernelWrapper kernel;
 
-        const char* xyzw_source = TestInfo<N>::kernel_source_xyzw;
+        std::string program_src =
+            pragma_extension + std::string(TestInfo<N>::kernel_source_xyzw);
+        const char* xyzw_source = program_src.c_str();
         error = create_single_kernel_helper(
             context, &program, &kernel, 1, &xyzw_source,
             "test_vector_swizzle_xyzw", buildOptions.c_str());
@@ -643,7 +647,9 @@
         clProgramWrapper program;
         clKernelWrapper kernel;
 
-        const char* sN_source = TestInfo<N>::kernel_source_sN;
+        std::string program_src =
+            pragma_extension + std::string(TestInfo<N>::kernel_source_sN);
+        const char* sN_source = program_src.c_str();
         error = create_single_kernel_helper(
             context, &program, &kernel, 1, &sN_source, "test_vector_swizzle_sN",
             buildOptions.c_str());
@@ -660,7 +666,9 @@
         const Version device_version = get_device_cl_version(device);
         if (device_version >= Version(3, 0))
         {
-            const char* rgba_source = TestInfo<N>::kernel_source_rgba;
+            std::string program_src =
+                pragma_extension + std::string(TestInfo<N>::kernel_source_rgba);
+            const char* rgba_source = program_src.c_str();
             error = create_single_kernel_helper(
                 context, &program, &kernel, 1, &rgba_source,
                 "test_vector_swizzle_rgba", buildOptions.c_str());
@@ -689,6 +697,7 @@
                         cl_command_queue queue, int num_elements)
 {
     int hasDouble = is_extension_available(device, "cl_khr_fp64");
+    int hasHalf = is_extension_available(device, "cl_khr_fp16");
 
     int result = TEST_PASS;
     result |= test_type<cl_char>("char", device, context, queue);
@@ -703,8 +712,14 @@
         result |= test_type<cl_ulong>("ulong", device, context, queue);
     }
     result |= test_type<cl_float>("float", device, context, queue);
+    if (hasHalf)
+    {
+        pragma_extension = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        result |= test_type<cl_half>("half", device, context, queue);
+    }
     if (hasDouble)
     {
+        pragma_extension = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
         result |= test_type<cl_double>("double", device, context, queue);
     }
     return result;
diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index e137f9e..d34ecbf 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -13,52 +13,129 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
-
+#include <algorithm>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <vector>
 
+#include <CL/cl_half.h>
 
 #include "procs.h"
 #include "harness/conversions.h"
-#include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
 
 // Outputs debug information for stores
 #define DEBUG 0
 // Forces stores/loads to be done with offsets = tid
 #define LINEAR_OFFSETS 0
 #define NUM_LOADS    512
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
 
-static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+char pragma_str[128] = { 0 };
+char mem_type[64] = { 0 };
+char store_str[128] = { 0 };
+char load_str[128] = { 0 };
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+// clang-format off
+static const char *store_pattern= "results[ tid ] = tmp;\n";
+static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
+static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
+static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
+static const char *kernel_pattern[] = {
+pragma_str,
+"#define STYPE %s\n"
+"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"
+};
+
+const char *pattern_local [] = {
+pragma_str,
+"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"   int lid = get_local_id( 0 );\n"
+"\n"
+"    if( lid == 0 )\n"
+"    {\n"
+"        for( int i = 0; i < %d; i++ ) {\n"
+"           ", load_str,
+"        }\n"
+"    }\n"
+//  Note: the copy loop above runs only on the first work-item of each work-group (lid == 0); the barrier below should
+//  ensure that every work-item (including the one that did the copy) has caught up before any proceeds, i.e. the
+//  shared storage should be fully populated, as seen by all work-items in the group, at that point.
+"   barrier( CLK_LOCAL_MEM_FENCE );\n"
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n" };
+
+const char *pattern_priv [] = {
+pragma_str,
+// Private memory is unique per work-item, unlike local storage, which is unique per work-group. This means that,
+// for the test to be effective, the entire test buffer has to be copied into private storage BY EACH WORK-ITEM.
+"#define PRIV_TYPE %s\n"
+"#define PRIV_SIZE %d\n"
+"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
+"    int tid = get_global_id( 0 );\n"
+"\n"
+"    for( int i = 0; i < PRIV_SIZE; i++ )\n"
+"      sPrivateStorage[ i ] = src[ i ];\n"
+//  Note: unlike the local test, each work-item runs the copy loop above independently, so no work-item needs to
+//  wait for any other to sync up before loading.
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"};
+// clang-format on
 
 #pragma mark -------------------- vload harness --------------------------
 
-typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );
+typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
+                                  size_t);
+typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
+                       unsigned int, create_program_fn, size_t);
 
-int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-               create_vload_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
+               ExplicitType type, unsigned int vecSize,
+               create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 4 ];
+    MTdataHolder d(gRandomSeed);
     const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
 
     if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
     size_t threads[ 1 ], localThreads[ 1 ];
     clProtectedArray inBuffer( bufferSize );
-    char programSrc[ 10240 ];
     cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
     size_t numElements, typeSize, i;
     unsigned int outVectorSize;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
 
     typeSize = get_explicit_type_size( type );
     numElements = bufferSize / ( typeSize * vecSize );
@@ -83,25 +160,19 @@
     outVectorSize = vecSize;
 
     // Declare output buffers now
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char outBuffer[ numLoads * typeSize * outVectorSize ];
-    char referenceBuffer[ numLoads * typeSize * vecSize ];
-#else
-    char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
-    char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
+    std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);
 
     // Create the program
-
-
+    std::string programSrc;
     createFn( programSrc, numElements, type, vecSize, outVectorSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -115,7 +186,9 @@
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
     test_error( error, "Unable to create kernel stream" );
-    streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
+    streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                numLoads * typeSize * outVectorSize,
+                                (void *)outBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
 
     // Set parameters and run
@@ -145,28 +218,32 @@
     test_error( error, "Unable to exec kernel" );
 
     // Get the results
-    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+                                numLoads * typeSize * outVectorSize
+                                    * sizeof(cl_char),
+                                (void *)outBuffer.data(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
-
     // Create the reference results
-    memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
+    referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
     for( i = 0; i < numLoads; i++ )
     {
-        memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
-               typeSize * vecSize );
+        memcpy(&referenceBuffer[i * typeSize * vecSize],
+               ((char *)(void *)inBuffer)
+                   + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
+               typeSize * vecSize);
     }
 
     // Validate the results now
-    char *expected = referenceBuffer;
-    char *actual = outBuffer;
+    char *expected = referenceBuffer.data();
+    char *actual = outBuffer.data();
     char *in = (char *)(void *)inBuffer;
 
     if (DEBUG) {
         log_info("Memory contents:\n");
+        char inString[1024];
+        char expectedString[1024], actualString[1024];
         for (i=0; i<numElements; i++) {
-            char  inString[1024];
-            char expectedString[ 1024 ], actualString[ 1024 ];
             if (i < numLoads) {
                 log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                          GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
@@ -197,35 +274,42 @@
         expected += typeSize * vecSize;
         actual += typeSize * outVectorSize;
     }
-
     return 0;
 }
 
-int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
+template <test_fn test_func_ptr>
+int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
+              create_program_fn createFn, size_t bufferSize)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                          kInt,   kUInt,  kLong,  kULong,
+                                          kFloat, kHalf,  kDouble };
     unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
     const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
     int error = 0;
-    MTdata mtData = init_genrand( gRandomSeed );
 
     log_info("Testing with buffer size of %d.\n", (int)bufferSize);
 
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
+    bool hasDouble = is_extension_available(device, "cl_khr_fp64");
+    bool hasHalf = is_extension_available(device, "cl_khr_fp16");
+
+    for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
     {
-
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
+        if (vecType[typeIdx] == kDouble && !hasDouble)
+            continue;
+        else if (vecType[typeIdx] == kHalf && !hasHalf)
+            continue;
+        else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
+                 && !gHasLong)
             continue;
 
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
-            continue;
-
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
+        for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
         {
             log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
 
-            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
+            int error_this_type =
+                test_func_ptr(device, context, queue, vecType[typeIdx],
+                              vecSizes[sizeIdx], createFn, bufferSize);
             if (error_this_type) {
                 error += error_this_type;
                 log_error("Failure; skipping further sizes for this type.");
@@ -233,125 +317,59 @@
             }
         }
     }
-
-    free_mtdata(mtData);
-
     return error;
 }
 
 #pragma mark -------------------- vload test cases --------------------------
 
-void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__global");
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
     }
+    // Args: STYPE, results type, tmp base type, tmp width, vload width.
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vloadset( device, context, queue, create_global_load_code, 10240 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_global_load_code, 10240);
 }
 
-
-void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
+                            ExplicitType type, size_t inVectorSize,
+                            size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ )\n"
-    "           sSharedStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ ) {\n"
-    "           sSharedStorage[ 3*i   ] = src[ 3*i   ];\n"
-    "           sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
-    "           sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
-    "        }\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid +1] = tmp.s1;\n"
-    "   results[ 3*tid +2] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
+    std::snprintf(load_str, sizeof(load_str), "%s", load_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, /*(int)inBufferSize,*/
-                typeName, typeName,
-                (int)inBufferSize,
-                typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, /*(int)inBufferSize,*/
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
+        std::snprintf(load_str, sizeof(load_str), "%s", load_pattern);
     }
+    // Args: local type, src type, results type, copy count, tmp/vload args.
+    std::string kernel_src = concat_kernel(
+        pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
+                             outTypeName.c_str(), (int)inBufferSize, typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -360,53 +378,34 @@
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
+    if (localSize > 10240) localSize = 10240;
     if (localSize > 4096)
         localSize -= 2048;
     else
         localSize /= 2;
 
-    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
+    return test_vset<test_vload>(device, context, queue, create_local_load_code,
+                                 (size_t)localSize);
 }
 
-
-void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__constant");
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName,  typeName,
-                typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
     }
+    // Args: STYPE, results type, tmp base type, tmp width, vload width.
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -415,109 +414,71 @@
     cl_ulong maxSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
     test_error( error, "Unable to get max size of constant memory buffer" );
-    if( maxSize > 10240 )
-        maxSize = 10240;
+    if (maxSize > 10240) maxSize = 10240;
     if (maxSize > 4096)
         maxSize -= 2048;
     else
         maxSize /= 2;
 
-    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_constant_load_code, (size_t)maxSize);
 }
 
-
-void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s%d\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < %d; i++ )\n"
-    "      sPrivateStorage[ i ] = src[ i ];\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < PRIV_SIZE; i++ )\n"
-    "    {\n"
-    "        sPrivateStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize ==3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, 3*((int)inBufferSize),
-                typeName, typeName,
-                typeName );
-        // log_info("Src is \"\n%s\n\"\n", destBuffer);
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    int bufSize = (int)inBufferSize * 3;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        bufSize = (int)inBufferSize;
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
     }
+    // Args: PRIV_TYPE, PRIV_SIZE, src type, results type, tmp/vload args.
+    std::string kernel_src = concat_kernel(
+        pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
+                             inTypeName.c_str(), outTypeName.c_str(), typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vloadset( device, context, queue, create_private_load_code, 256 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_private_load_code, 256);
 }
 
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma mark -------------------- vstore harness --------------------------
 
-typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );
-
-int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-                create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
+                ExplicitType type, unsigned int vecSize,
+                create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 3 ];
+    MTdataHolder d(gRandomSeed);
 
     size_t threads[ 1 ], localThreads[ 1 ];
-
     size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     if (DEBUG)
         bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
@@ -534,39 +495,22 @@
     }
     if (DEBUG)
         log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    cl_uint offsets[ numStores ];
-#else
-    cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
-#endif
-    char programSrc[ 10240 ];
-    size_t i;
 
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char inBuffer[ numStores * typeSize * vecSize ];
-#else
-    char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<cl_uint> offsets(numStores);
+    std::vector<char> inBuffer(numStores * typeSize * vecSize);
+
     clProtectedArray outBuffer( numElements * typeSize * vecSize );
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char referenceBuffer[ numElements * typeSize * vecSize ];
-#else
-    char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> referenceBuffer(numElements * typeSize * vecSize);
 
     // Create some random input data and random offsets to load from
-    generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );
+    generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());
 
     // Note: make sure no two offsets are the same, otherwise the output would depend on
     // the order that threads ran in, and that would be next to impossible to verify
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char flags[ numElements ];
-#else
-    char* flags = (char*)_malloca( numElements * sizeof(char));
-#endif
+    std::vector<char> flags(numElements);
+    flags.assign(flags.size(), 0);
 
-    memset( flags, 0, numElements * sizeof(char) );
-    for( i = 0; i < numStores; i++ )
+    for (size_t i = 0; i < numStores; i++)
     {
         do
         {
@@ -579,13 +523,15 @@
     if (LINEAR_OFFSETS)
         log_info("Offsets set to thread IDs to simplify output.\n");
 
-    createFn( programSrc, numElements, type, vecSize );
+    std::string programSrc;
+    createFn(programSrc, numElements, type, vecSize, vecSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -593,9 +539,14 @@
     test_error( error, "clGetKernelInfo failed");
 
     // Set up parameters
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
+    streams[0] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * typeSize * vecSize * sizeof(cl_char),
+                       (void *)inBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * sizeof(cl_uint), offsets.data(), &error);
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
     test_error( error, "Unable to create kernel stream" );
@@ -606,7 +557,7 @@
         // We need to set the size of the local storage
         error = clSetKernelArg(kernel, 0, bufferSize, NULL);
         test_error( error, "clSetKernelArg for buffer failed");
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
             test_error( error, "Unable to set kernel argument" );
@@ -615,11 +566,10 @@
     else
     {
         // No local storage
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
-            if (error)
-                log_info("%s\n", programSrc);
+            if (error) log_info("%s\n", programSrc.c_str());
             test_error( error, "Unable to set kernel argument" );
         }
     }
@@ -654,25 +604,26 @@
         error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
         test_error( error, "Unable to read results" );
 
-
         // Create the reference results
-        memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
-        for( i = 0; i < numStores; i++ )
+        referenceBuffer.assign(referenceBuffer.size(), 0);
+        for (size_t i = 0; i < numStores; i++)
         {
-            memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
+            memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
+                                    * typeSize],
+                   &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
         }
 
         // Validate the results now
-        char *expected = referenceBuffer;
+        char *expected = referenceBuffer.data();
         char *actual = (char *)(void *)outBuffer;
 
         if (DEBUG)
         {
             log_info("Memory contents:\n");
-            for (i=0; i<numElements; i++)
+            char inString[1024];
+            char expectedString[1024], actualString[1024];
+            for (size_t i = 0; i < numElements; i++)
             {
-                char  inString[1024];
-                char expectedString[ 1024 ], actualString[ 1024 ];
                 if (i < numStores)
                 {
                     log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
@@ -693,7 +644,7 @@
             }
         }
 
-        for( i = 0; i < numElements; i++ )
+        for (size_t i = 0; i < numElements; i++)
         {
             if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
             {
@@ -719,62 +670,26 @@
             actual += typeSize * vecSize;
         }
     }
-
     return 0;
 }
 
-int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
-{
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
-    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
-    const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
-    int error = 0;
-    MTdata d = init_genrand( gRandomSeed );
-
-    log_info("Testing with buffer size of %d.\n", (int)bufferSize);
-
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
-    {
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
-            continue;
-
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
-            continue;
-
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
-        {
-            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
-
-            int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
-            if (error_this_type)
-            {
-                log_error("Failure; skipping further sizes for this type.\n");
-                error += error_this_type;
-                break;
-            }
-        }
-    }
-
-    free_mtdata(d);
-    return error;
-}
-
-
 #pragma mark -------------------- vstore test cases --------------------------
 
-void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     "    vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
+    const char *patternV3 [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
@@ -783,45 +698,48 @@
     "    } else {\n"
     "      vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
     "    }\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
-
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, typeName, typeName);
-
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
     }
-    // if(inVectorSize == 3 || inVectorSize == 4) {
-    //     log_info("\n----\n%s\n----\n", destBuffer);
-    // }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
+                                 typeName, (int)inVectorSize);
+    }
 }
 
 int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_global_store_code, 10240);
 }
 
-
-void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    // clang-format off
+    const char *pattern[] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "#define LOC_VTYPE %s%d\n"
+    "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
+    " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
     " sSharedStorage[ offsets[tid] +1 ] =  sSharedStorage[ offsets[tid] ];\n"
     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
     "\n"
-    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
+    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
     "\n"
     // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
     // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
@@ -830,20 +748,20 @@
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
+    const char *patternV3 [] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    "    sSharedStorage[ 3*offsets[tid]   ] = (%s)0;\n"
+    "    sSharedStorage[ 3*offsets[tid]   ] = (LOC_TYPE)0;\n"
     "    sSharedStorage[ 3*offsets[tid] +1 ] =  \n"
     "        sSharedStorage[ 3*offsets[tid] ];\n"
     "    sSharedStorage[ 3*offsets[tid] +2 ] =  \n"
@@ -865,30 +783,26 @@
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,
-                typeName,
-                typeName,  typeName,
-                typeName, typeName, typeName  );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName,
-                typeName, typeName, typeName  );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName);
     }
-    // log_info(destBuffer);
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName,
+                                 (int)inVectorSize, (int)inVectorSize);
+    }
 }
 
 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -897,81 +811,82 @@
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
+    if (localSize > 10240) localSize = 10240;
     if (localSize > 4096)
         localSize -= 2048;
     else
         localSize /= 2;
-    return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_local_store_code, (size_t)localSize);
 }
 
-
-void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s%d\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s%d sPrivateStorage[ %d ];\n"
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-    "   vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
+    "  vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "\n"
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  uint i;\n"
-    "  __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n"};
 
-
-    const char *patternV3 =
-    "%s"
+    const char *patternV3  [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s3\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s3)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-
-    "   vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
-    "\n"
-    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
-    // otherwise, local threads would be overwriting results from other local threads
+    "  vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "  uint i;\n"
-    "  __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
-    "  __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n"};
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,  typeName,
-                typeName, (int)inBufferSize,
-                typeName, typeName,
-                typeName, typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
+                        (int)inBufferSize, (int)inVectorSize);
     }
 }
 
@@ -979,7 +894,8 @@
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vstoreset( device, context, queue, create_private_store_code, 256 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_private_store_code, 256);
 }
 
 
diff --git a/test_conformance/basic/test_wg_barrier.cpp b/test_conformance/basic/test_wg_barrier.cpp
deleted file mode 100644
index a237d80..0000000
--- a/test_conformance/basic/test_wg_barrier.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-
-#include "procs.h"
-
-const char *wg_barrier_kernel_code =
-"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
-"{\n"
-"    int  tid = get_local_id(0);\n"
-"    int  lsize = get_local_size(0);\n"
-"    int  i;\n"
-"\n"
-"    tmp_sum[tid] = 0;\n"
-"    for (i=tid; i<n; i+=lsize)\n"
-"        tmp_sum[tid] += a[i];\n"
-"     \n"
-"     // updated to work for any workgroup size \n"
-"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
-"    {\n"
-"        work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"        if (tid + i < lsize)\n"
-"            tmp_sum[tid] += tmp_sum[tid + i];\n"
-"         lsize = i; \n"
-"    }\n"
-"\n"
-"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
-"    if (tid == 0)\n"
-"        *sum = tmp_sum[0];\n"
-"}\n";
-
-
-static int
-verify_sum(int *inptr, int *tmpptr, int *outptr, int n)
-{
-    int i;
-    int reference = 0;
-
-    for (i=0; i<n; i++)
-    {
-        reference += inptr[i];
-    }
-
-    if (reference != outptr[0])
-    {
-        log_error("work_group_barrier test failed\n");
-        return -1;
-    }
-
-    log_info("work_group_barrier test passed\n");
-    return 0;
-}
-
-
-int
-test_wg_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    cl_mem            streams[3];
-    cl_int            *input_ptr = NULL, *output_ptr = NULL, *tmp_ptr =NULL;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    size_t    local_threads[3];
-    int                err;
-    int                i;
-    size_t max_local_workgroup_size[3];
-    size_t max_threadgroup_size = 0;
-    MTdata d;
-
-    err = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel, 1, &wg_barrier_kernel_code, "compute_sum",
-        nullptr);
-    test_error(err, "Failed to build kernel/program.");
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                 sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
-    test_error(err, "clGetKernelWorkgroupInfo failed.");
-
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
-    test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
-
-    // Pick the minimum of the device and the kernel
-    if (max_threadgroup_size > max_local_workgroup_size[0])
-        max_threadgroup_size = max_local_workgroup_size[0];
-
-    // work group size must divide evenly into the global size
-    while( num_elements % max_threadgroup_size )
-        max_threadgroup_size--;
-
-    input_ptr = (int*)malloc(sizeof(int) * num_elements);
-    output_ptr = (int*)malloc(sizeof(int));
-
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-    streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-    streams[2] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * max_threadgroup_size, NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
-    free_mtdata(d);  d = NULL;
-
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
-    test_error(err, "clEnqueueWriteBuffer failed.");
-
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
-    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
-    test_error(err, "clSetKernelArg failed.");
-
-    global_threads[0] = max_threadgroup_size;
-    local_threads[0] = max_threadgroup_size;
-
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
-    test_error(err, "clEnqueueNDRangeKernel failed.");
-
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
-    test_error(err, "clEnqueueReadBuffer failed.");
-
-    err = verify_sum(input_ptr, tmp_ptr, output_ptr, num_elements);
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(output_ptr);
-
-    return err;
-}
diff --git a/test_conformance/basic/test_work_item_functions.cpp b/test_conformance/basic/test_work_item_functions.cpp
index d95915c..9683a83 100644
--- a/test_conformance/basic/test_work_item_functions.cpp
+++ b/test_conformance/basic/test_work_item_functions.cpp
@@ -91,7 +91,6 @@
     {
         for( int i = 0; i < NUM_TESTS; i++  )
         {
-            size_t numItems = 1;
             for( size_t j = 0; j < dim; j++ )
             {
                 // All of our thread sizes should be within the max local sizes, since they're all <= 20
@@ -100,8 +99,6 @@
                 while( localThreads[ j ] > 1 && ( threads[ j ] % localThreads[ j ] != 0 ) )
                     localThreads[ j ]--;
 
-                numItems *= threads[ j ];
-
                 // Hack for now: localThreads > 1 are iffy
                 localThreads[ j ] = 1;
             }
diff --git a/test_conformance/c11_atomics/CMakeLists.txt b/test_conformance/c11_atomics/CMakeLists.txt
index 621adda..0d389bc 100644
--- a/test_conformance/c11_atomics/CMakeLists.txt
+++ b/test_conformance/c11_atomics/CMakeLists.txt
@@ -7,4 +7,6 @@
     test_atomics.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sign-compare")
+
 include(../CMakeCommon.txt)
diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index 6c7d0b1..37c37e8 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _COMMON_H_
-#define _COMMON_H_
+#ifndef COMMON_H_
+#define COMMON_H_
 
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
@@ -1567,4 +1567,4 @@
     return 0;
 }
 
-#endif //_COMMON_H_
+#endif // COMMON_H_
diff --git a/test_conformance/c11_atomics/host_atomics.h b/test_conformance/c11_atomics/host_atomics.h
index 6c4e783..b865970 100644
--- a/test_conformance/c11_atomics/host_atomics.h
+++ b/test_conformance/c11_atomics/host_atomics.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _HOST_ATOMICS_H_
-#define _HOST_ATOMICS_H_
+#ifndef HOST_ATOMICS_H_
+#define HOST_ATOMICS_H_
 
 #include "harness/testHarness.h"
 
@@ -247,4 +247,4 @@
 bool host_atomic_flag_test_and_set(volatile HOST_ATOMIC_FLAG *a, TExplicitMemoryOrderType order);
 void host_atomic_flag_clear(volatile HOST_ATOMIC_FLAG *a, TExplicitMemoryOrderType order);
 
-#endif //_HOST_ATOMICS_H_
+#endif // HOST_ATOMICS_H_
diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp
index d905b2c..ca2c224 100644
--- a/test_conformance/c11_atomics/test_atomics.cpp
+++ b/test_conformance/c11_atomics/test_atomics.cpp
@@ -3145,7 +3145,7 @@
     }
 
 private:
-    int _subCaseId;
+    size_t _subCaseId;
     struct TestDefinition _subCase;
 };
 
diff --git a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
index 0a459e9..5d0e99e 100644
--- a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
+++ b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
@@ -19,7 +19,7 @@
 #include "harness/errorHelpers.h"
 #include "harness/deviceInfo.h"
 #include <assert.h>
-#include <iostream>
+#include <algorithm>
 #include <stdexcept>
 
 #define ASSERT(x) assert((x))
@@ -740,21 +740,42 @@
     cl_int err = 0;
     cl_device_id devList[] = { deviceId, NULL };
 
-#ifdef _WIN32
-    if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32"))
+    switch (externalSemaphoreHandleType)
     {
-        throw std::runtime_error("Device does not support "
-                                 "cl_khr_external_semaphore_win32 extension\n");
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_opaque_fd"))
+            {
+                throw std::runtime_error("Device does not support "
+                                         "cl_khr_external_semaphore_opaque_fd "
+                                         "extension \n");
+            }
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_win32"))
+            {
+                throw std::runtime_error(
+                    "Device does not support "
+                    "cl_khr_external_semaphore_win32 extension\n");
+            }
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_sync_fd"))
+            {
+                throw std::runtime_error(
+                    "Device does not support cl_khr_external_semaphore_sync_fd "
+                    "extension \n");
+            }
+            break;
+        default:
+            throw std::runtime_error(
+                "Unsupported external semaphore handle type\n");
+            break;
     }
-#elif !defined(__APPLE__)
-    if (!is_extension_available(devList[0],
-                                "cl_khr_external_semaphore_opaque_fd"))
-    {
-        throw std::runtime_error(
-            "Device does not support cl_khr_external_semaphore_opaque_fd "
-            "extension \n");
-    }
-#endif
 
     std::vector<cl_semaphore_properties_khr> sema_props{
         (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
@@ -803,6 +824,16 @@
             sema_props.push_back((cl_semaphore_properties_khr)handle);
 #endif
             break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            err = check_external_semaphore_handle_type(
+                devList[0], CL_SEMAPHORE_HANDLE_SYNC_FD_KHR);
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR));
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_HANDLE_SYNC_FD_KHR));
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR));
+            break;
         default:
             ASSERT(0);
             log_error("Unsupported external memory handle type\n");
@@ -856,3 +887,67 @@
 {
     return m_externalSemaphore;
 }
+
+// Translate a Vulkan external-memory handle type into the matching OpenCL
+// handle type. NONE, and any value without an entry below, yields 0.
+cl_external_memory_handle_type_khr vkToOpenCLExternalMemoryHandleType(
+    VulkanExternalMemoryHandleType vkExternalMemoryHandleType)
+{
+    const VulkanExternalMemoryHandleType vkType = vkExternalMemoryHandleType;
+    if (vkType == VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD)
+        return CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR;
+    if (vkType == VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT)
+        return CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR;
+    if (vkType == VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+        || vkType == VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT)
+        return CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR;
+    // VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE or an unlisted value.
+    return 0;
+}
+
+// Choose the Vulkan tiling for an image shared with OpenCL through the given
+// external memory handle type.
+//
+// The device is asked which handle types it imports as linear images
+// (CL_DEVICE_EXTERNAL_MEMORY_IMPORT_ASSUME_LINEAR_IMAGES_HANDLE_TYPES_KHR).
+// Returns VULKAN_IMAGE_TILING_LINEAR when the translated handle type is in
+// that list and VULKAN_IMAGE_TILING_OPTIMAL otherwise, including when a query
+// fails. The status of the last clGetDeviceInfo call is stored in *error_ret,
+// which must not be null.
+VulkanImageTiling vkClExternalMemoryHandleTilingAssumption(
+    cl_device_id deviceId,
+    VulkanExternalMemoryHandleType vkExternalMemoryHandleType, int *error_ret)
+{
+    assert(error_ret != nullptr); // the caller must inspect the status
+
+    // First call: only ask how many bytes the handle-type list occupies.
+    size_t listBytes = 0;
+    *error_ret = clGetDeviceInfo(
+        deviceId,
+        CL_DEVICE_EXTERNAL_MEMORY_IMPORT_ASSUME_LINEAR_IMAGES_HANDLE_TYPES_KHR,
+        0, nullptr, &listBytes);
+    if (*error_ret != CL_SUCCESS || listBytes == 0)
+    {
+        return VULKAN_IMAGE_TILING_OPTIMAL;
+    }
+
+    // Second call: fetch the list itself.
+    std::vector<cl_external_memory_handle_type_khr> linearTypes(
+        listBytes / sizeof(cl_external_memory_handle_type_khr));
+    *error_ret = clGetDeviceInfo(
+        deviceId,
+        CL_DEVICE_EXTERNAL_MEMORY_IMPORT_ASSUME_LINEAR_IMAGES_HANDLE_TYPES_KHR,
+        listBytes, linearTypes.data(), nullptr);
+    if (*error_ret != CL_SUCCESS)
+    {
+        return VULKAN_IMAGE_TILING_OPTIMAL;
+    }
+
+    const cl_external_memory_handle_type_khr wanted =
+        vkToOpenCLExternalMemoryHandleType(vkExternalMemoryHandleType);
+    const bool assumedLinear =
+        std::find(linearTypes.begin(), linearTypes.end(), wanted)
+        != linearTypes.end();
+    return assumedLinear ? VULKAN_IMAGE_TILING_LINEAR
+                         : VULKAN_IMAGE_TILING_OPTIMAL;
+}
diff --git a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
index 5143332..4a1d453 100644
--- a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
+++ b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
@@ -129,4 +129,8 @@
 
 extern void init_cl_vk_ext(cl_platform_id);
 
+VulkanImageTiling vkClExternalMemoryHandleTilingAssumption(
+    cl_device_id deviceId,
+    VulkanExternalMemoryHandleType vkExternalMemoryHandleType, int *error_ret);
+
 #endif // _opencl_vulkan_wrapper_hpp_
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_api_list.hpp b/test_conformance/common/vulkan_wrapper/vulkan_api_list.hpp
index c62a71e..e9c06f9 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_api_list.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_api_list.hpp
@@ -75,6 +75,7 @@
     VK_FUNC_DECL(vkDestroyImageView)                                           \
     VK_FUNC_DECL(vkCreateImage)                                                \
     VK_FUNC_DECL(vkGetImageMemoryRequirements)                                 \
+    VK_FUNC_DECL(vkGetImageMemoryRequirements2)                                \
     VK_FUNC_DECL(vkDestroyImage)                                               \
     VK_FUNC_DECL(vkDestroyBuffer)                                              \
     VK_FUNC_DECL(vkDestroyPipeline)                                            \
@@ -87,8 +88,9 @@
     VK_FUNC_DECL(vkDestroyDescriptorSetLayout)                                 \
     VK_FUNC_DECL(vkGetPhysicalDeviceQueueFamilyProperties)                     \
     VK_FUNC_DECL(vkGetPhysicalDeviceFeatures)                                  \
-    VK_FUNC_DECL(vkGetPhysicalDeviceProperties2KHR)                            \
+    VK_FUNC_DECL(vkGetPhysicalDeviceProperties2)                               \
     VK_FUNC_DECL(vkGetBufferMemoryRequirements)                                \
+    VK_FUNC_DECL(vkGetBufferMemoryRequirements2)                               \
     VK_FUNC_DECL(vkGetMemoryFdKHR)                                             \
     VK_FUNC_DECL(vkGetSemaphoreFdKHR)                                          \
     VK_FUNC_DECL(vkEnumeratePhysicalDeviceGroups)                              \
@@ -160,6 +162,7 @@
 #define vkDestroyImageView _vkDestroyImageView
 #define vkCreateImage _vkCreateImage
 #define vkGetImageMemoryRequirements _vkGetImageMemoryRequirements
+#define vkGetImageMemoryRequirements2 _vkGetImageMemoryRequirements2
 #define vkDestroyImage _vkDestroyImage
 #define vkDestroyBuffer _vkDestroyBuffer
 #define vkDestroyPipeline _vkDestroyPipeline
@@ -173,8 +176,9 @@
 #define vkGetPhysicalDeviceQueueFamilyProperties                               \
     _vkGetPhysicalDeviceQueueFamilyProperties
 #define vkGetPhysicalDeviceFeatures _vkGetPhysicalDeviceFeatures
-#define vkGetPhysicalDeviceProperties2KHR _vkGetPhysicalDeviceProperties2KHR
+#define vkGetPhysicalDeviceProperties2 _vkGetPhysicalDeviceProperties2
 #define vkGetBufferMemoryRequirements _vkGetBufferMemoryRequirements
+#define vkGetBufferMemoryRequirements2 _vkGetBufferMemoryRequirements2
 #define vkGetMemoryFdKHR _vkGetMemoryFdKHR
 #define vkGetSemaphoreFdKHR _vkGetSemaphoreFdKHR
 #define vkEnumeratePhysicalDeviceGroups _vkEnumeratePhysicalDeviceGroups
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_list_map.cpp b/test_conformance/common/vulkan_wrapper/vulkan_list_map.cpp
index 4e27651..a5ca090 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_list_map.cpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_list_map.cpp
@@ -141,6 +141,16 @@
 
 VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList() {}
 
+// Build a heap-allocated binding from the arguments and append it with add().
+void VulkanDescriptorSetLayoutBindingList::addBinding(
+    size_t binding, VulkanDescriptorType descriptorType,
+    uint32_t descriptorCount, VulkanShaderStage shaderStage)
+{
+    // NOTE(review): no delete here; presumably the list owns it - confirm.
+    add(*new VulkanDescriptorSetLayoutBinding(binding, descriptorType,
+                                              descriptorCount, shaderStage));
+}
+
 VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList(
     size_t numDescriptorSetLayoutBindings, VulkanDescriptorType descriptorType,
     uint32_t descriptorCount, VulkanShaderStage shaderStage)
@@ -268,6 +278,7 @@
     size_t numImages, std::vector<VulkanDeviceMemory *> &deviceMemory,
     uint64_t baseOffset, uint64_t interImageOffset, const VulkanDevice &device,
     VulkanFormat format, uint32_t width, uint32_t height, uint32_t mipLevels,
+    VulkanImageTiling vulkanImageTiling,
     VulkanExternalMemoryHandleType externalMemoryHandleType,
     VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
     VulkanSharingMode sharingMode)
@@ -275,8 +286,8 @@
     for (size_t i2DIdx = 0; i2DIdx < numImages; i2DIdx++)
     {
         VulkanImage2D *image2D = new VulkanImage2D(
-            device, format, width, height, mipLevels, externalMemoryHandleType,
-            imageCreateFlag, imageUsage, sharingMode);
+            device, format, width, height, vulkanImageTiling, mipLevels,
+            externalMemoryHandleType, imageCreateFlag, imageUsage, sharingMode);
         add(*image2D);
         deviceMemory[i2DIdx]->bindImage(
             *image2D, baseOffset + (i2DIdx * interImageOffset));
@@ -285,16 +296,16 @@
 
 VulkanImage2DList::VulkanImage2DList(
     size_t numImages, const VulkanDevice &device, VulkanFormat format,
-    uint32_t width, uint32_t height, uint32_t mipLevels,
-    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    uint32_t width, uint32_t height, VulkanImageTiling vulkanImageTiling,
+    uint32_t mipLevels, VulkanExternalMemoryHandleType externalMemoryHandleType,
     VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
     VulkanSharingMode sharingMode)
 {
     for (size_t bIdx = 0; bIdx < numImages; bIdx++)
     {
         VulkanImage2D *image2D = new VulkanImage2D(
-            device, format, width, height, mipLevels, externalMemoryHandleType,
-            imageCreateFlag, imageUsage, sharingMode);
+            device, format, width, height, vulkanImageTiling, mipLevels,
+            externalMemoryHandleType, imageCreateFlag, imageUsage, sharingMode);
         add(*image2D);
     }
 }
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_list_map.hpp b/test_conformance/common/vulkan_wrapper/vulkan_list_map.hpp
index 5220677..ef00b70 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_list_map.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_list_map.hpp
@@ -154,6 +154,10 @@
         VulkanDescriptorType descriptorType0, uint32_t descriptorCount0,
         VulkanDescriptorType descriptorType1, uint32_t descriptorCount1,
         VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE);
+    void
+    addBinding(size_t binding, VulkanDescriptorType descriptorType,
+               uint32_t descriptorCount,
+               VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE);
     virtual ~VulkanDescriptorSetLayoutBindingList();
 };
 
@@ -208,6 +212,7 @@
         uint64_t baseOffset, uint64_t interImageOffset,
         const VulkanDevice &device, VulkanFormat format, uint32_t width,
         uint32_t height, uint32_t mipLevels,
+        VulkanImageTiling vulkanImageTiling,
         VulkanExternalMemoryHandleType externalMemoryHandleType =
             VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
         VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
@@ -216,7 +221,8 @@
         VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
     VulkanImage2DList(
         size_t numImages, const VulkanDevice &device, VulkanFormat format,
-        uint32_t width, uint32_t height, uint32_t mipLevels = 1,
+        uint32_t width, uint32_t height, VulkanImageTiling vulkanImageTiling,
+        uint32_t mipLevels = 1,
         VulkanExternalMemoryHandleType externalMemoryHandleType =
             VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
         VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp b/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
index 1a313cc..2124a27 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
@@ -21,6 +21,7 @@
 #include <fstream>
 #include <set>
 #include <string>
+#include <algorithm>
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
 #if defined(_WIN32) || defined(_WIN64)
@@ -248,6 +249,9 @@
     }
     externalSemaphoreHandleTypeList.push_back(
         VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT);
+#elif defined(__ANDROID__)
+    externalSemaphoreHandleTypeList.push_back(
+        VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD);
 #else
     externalSemaphoreHandleTypeList.push_back(
         VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD);
@@ -480,6 +484,33 @@
     return formatList;
 }
 
+// Map a Vulkan external-semaphore handle type to the matching OpenCL
+// cl_external_semaphore_handle_type_khr value.
+//
+// Both KMT variants map to CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR. Any
+// handle type without an entry below yields 0.
+cl_external_semaphore_handle_type_khr getCLSemaphoreTypeFromVulkanType(
+    VulkanExternalSemaphoreHandleType vulkanExternalSemaphoreHandleType)
+{
+    switch (vulkanExternalSemaphoreHandleType)
+    {
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD:
+            return CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR;
+
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT:
+            return CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR;
+
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT:
+            return CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR;
+
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            return CL_SEMAPHORE_HANDLE_SYNC_FD_KHR;
+
+        default: return 0;
+    }
+}
+
 uint32_t getVulkanFormatElementSize(VulkanFormat format)
 {
     switch (format)
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp b/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
index 04f5a59..5128412 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
@@ -33,7 +33,8 @@
 const VulkanInstance& getVulkanInstance();
 const VulkanPhysicalDevice& getVulkanPhysicalDevice();
 const VulkanQueueFamily&
-getVulkanQueueFamily(uint32_t queueFlags = VULKAN_QUEUE_FLAG_MASK_ALL);
+getVulkanQueueFamily(uint32_t queueFlags = VULKAN_QUEUE_FLAG_GRAPHICS
+                         | VULKAN_QUEUE_FLAG_COMPUTE);
 const VulkanMemoryType&
 getVulkanMemoryType(const VulkanDevice& device,
                     VulkanMemoryTypeProperty memoryTypeProperty);
@@ -51,6 +52,8 @@
 uint32_t getVulkanFormatElementSize(VulkanFormat format);
 const char* getVulkanFormatGLSLFormat(VulkanFormat format);
 const char* getVulkanFormatGLSLTypePrefix(VulkanFormat format);
+cl_external_semaphore_handle_type_khr getCLSemaphoreTypeFromVulkanType(
+    VulkanExternalSemaphoreHandleType vulkanExternalSemaphoreHandleType);
 
 std::string prepareVulkanShader(
     std::string shaderCode,
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
index 3ce4af6..73c5e9a 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
@@ -72,7 +72,9 @@
 
 #if defined(_WIN32) || defined(_WIN64)
     const char *vulkanLoaderLibraryName = "vulkan-1.dll";
-#elif defined(__linux__)
+#elif defined(__ANDROID__)
+    const char *vulkanLoaderLibraryName = "libvulkan.so";
+#else
     const char *vulkanLoaderLibraryName = "libvulkan.so.1";
 #endif
 #ifdef _WIN32
@@ -274,13 +276,13 @@
         VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR;
     vkPhysicalDeviceIDPropertiesKHR.pNext = NULL;
 
-    VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {};
-    vkPhysicalDeviceProperties2KHR.sType =
+    VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {};
+    vkPhysicalDeviceProperties2.sType =
         VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
-    vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR;
+    vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDPropertiesKHR;
 
-    vkGetPhysicalDeviceProperties2KHR(m_vkPhysicalDevice,
-                                      &vkPhysicalDeviceProperties2KHR);
+    vkGetPhysicalDeviceProperties2(m_vkPhysicalDevice,
+                                   &vkPhysicalDeviceProperties2);
 
     memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID,
            sizeof(m_vkDeviceUUID));
@@ -605,6 +607,37 @@
 VulkanDevice::operator VkDevice() const { return m_vkDevice; }
 
 ////////////////////////////////
+// VulkanFence implementation //
+////////////////////////////////
+
+// Create a fence with no creation flags on vkDevice.
+// Throws std::runtime_error if vkCreateFence does not return VK_SUCCESS.
+VulkanFence::VulkanFence(const VulkanDevice &vkDevice)
+{
+    device = vkDevice; // raw handle kept for the other member functions
+    VkFenceCreateInfo createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    createInfo.pNext = nullptr;
+    createInfo.flags = 0;
+    if (vkCreateFence(device, &createInfo, nullptr, &fence) != VK_SUCCESS)
+    {
+        throw std::runtime_error("Error: Failed create fence.");
+    }
+}
+
+// Destroy the fence created by the constructor.
+VulkanFence::~VulkanFence() { vkDestroyFence(device, fence, nullptr); }
+
+// Reset this fence; the VkResult of vkResetFences is ignored.
+void VulkanFence::reset() { vkResetFences(device, 1, &fence); }
+
+// Wait for this fence with a timeout of UINT64_MAX; the result is ignored.
+void VulkanFence::wait()
+{
+    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
+}
+
+////////////////////////////////
 // VulkanQueue implementation //
 ////////////////////////////////
 
@@ -615,6 +648,22 @@
 
 VulkanQueue::~VulkanQueue() {}
 
+// Submit a single command buffer with no wait or signal semaphores.
+// An empty vkFence submits with VK_NULL_HANDLE instead of dereferencing a
+// null pointer. The VkResult of vkQueueSubmit is not checked.
+void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer,
+                         const std::shared_ptr<VulkanFence> &vkFence)
+{
+    VulkanCommandBufferList commandBufferList;
+    commandBufferList.add(commandBuffer);
+    VkSubmitInfo vkSubmitInfo = {}; // pNext and semaphore fields stay zero
+    vkSubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    vkSubmitInfo.commandBufferCount = (uint32_t)commandBufferList.size();
+    vkSubmitInfo.pCommandBuffers = commandBufferList();
+    vkQueueSubmit(m_vkQueue, 1, &vkSubmitInfo,
+                  vkFence ? vkFence->fence : VK_NULL_HANDLE);
+}
+
 void VulkanQueue::submit(const VulkanSemaphoreList &waitSemaphoreList,
                          const VulkanCommandBufferList &commandBufferList,
                          const VulkanSemaphoreList &signalSemaphoreList)
@@ -964,12 +1013,14 @@
                 == vkDescriptorTypeToDescriptorCountMap.end())
             {
                 vkDescriptorTypeToDescriptorCountMap
-                    [vkDescriptorSetLayoutBinding.descriptorType] = 1;
+                    [vkDescriptorSetLayoutBinding.descriptorType] =
+                        vkDescriptorSetLayoutBinding.descriptorCount;
             }
             else
             {
                 vkDescriptorTypeToDescriptorCountMap
-                    [vkDescriptorSetLayoutBinding.descriptorType]++;
+                    [vkDescriptorSetLayoutBinding.descriptorType] +=
+                    vkDescriptorSetLayoutBinding.descriptorCount;
             }
         }
 
@@ -1110,6 +1161,35 @@
     vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL);
 }
 
+// Write buffers[0..numBuffers) as whole-buffer storage descriptors into the
+// array at binding. Throws std::runtime_error if the list is too short.
+void VulkanDescriptorSet::updateArray(uint32_t binding, unsigned numBuffers,
+                                      const VulkanBufferList &buffers)
+{
+    if (numBuffers > buffers.size())
+    {
+        throw std::runtime_error("updateArray: numBuffers exceeds list size");
+    }
+    // Value-initialized, so offset starts at 0; new[] throws if it fails.
+    VkDescriptorBufferInfo *vkDescriptorBufferInfo =
+        new VkDescriptorBufferInfo[numBuffers]();
+    for (unsigned i = 0; i < numBuffers; i++)
+    {
+        vkDescriptorBufferInfo[i].buffer = buffers[i];
+        vkDescriptorBufferInfo[i].range = VK_WHOLE_SIZE;
+    }
+
+    VkWriteDescriptorSet vkWriteDescriptorSet = {}; // unset fields stay 0/NULL
+    vkWriteDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    vkWriteDescriptorSet.dstSet = m_vkDescriptorSet;
+    vkWriteDescriptorSet.dstBinding = binding;
+    vkWriteDescriptorSet.descriptorCount = numBuffers;
+    vkWriteDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    vkWriteDescriptorSet.pBufferInfo = vkDescriptorBufferInfo;
+    vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL);
+    delete[] vkDescriptorBufferInfo;
+}
+
 void VulkanDescriptorSet::update(uint32_t binding,
                                  const VulkanImageView &imageView)
 {
@@ -1133,6 +1213,34 @@
     vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL);
 }
 
+// Bind every image view in imageViewList, in order, to the storage-image
+// array at the given binding, starting at array element 0. Each image is
+// described with VK_IMAGE_LAYOUT_GENERAL and no sampler.
+void VulkanDescriptorSet::updateArray(uint32_t binding,
+                                      const VulkanImageViewList &imageViewList)
+{
+    const size_t viewCount = imageViewList.size();
+    VkDescriptorImageInfo *imageInfos = new VkDescriptorImageInfo[viewCount];
+    for (size_t idx = 0; idx != viewCount; ++idx)
+    {
+        VkDescriptorImageInfo &info = imageInfos[idx];
+        info.sampler = VK_NULL_HANDLE;
+        info.imageView = imageViewList[idx];
+        info.imageLayout = VK_IMAGE_LAYOUT_GENERAL;
+    }
+
+    VkWriteDescriptorSet write = {};
+    write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write.dstSet = m_vkDescriptorSet;
+    write.dstBinding = binding;
+    write.descriptorCount = viewCount;
+    write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+    write.pImageInfo = imageInfos;
+
+    vkUpdateDescriptorSets(m_device, 1, &write, 0, NULL);
+    delete[] imageInfos;
+}
+
 VulkanDescriptorSet::operator VkDescriptorSet() const
 {
     return m_vkDescriptorSet;
@@ -1456,12 +1564,14 @@
       m_memoryTypeList(buffer.m_memoryTypeList)
 {}
 
+bool VulkanBuffer::isDedicated() const { return m_dedicated; }
+
 VulkanBuffer::VulkanBuffer(
     const VulkanDevice &device, uint64_t size,
     VulkanExternalMemoryHandleType externalMemoryHandleType,
     VulkanBufferUsage bufferUsage, VulkanSharingMode sharingMode,
     const VulkanQueueFamilyList &queueFamilyList)
-    : m_device(device), m_vkBuffer(VK_NULL_HANDLE)
+    : m_device(device), m_vkBuffer(VK_NULL_HANDLE), m_dedicated(false)
 {
     std::vector<uint32_t> queueFamilyIndexList;
     if (queueFamilyList.size() == 0)
@@ -1507,16 +1617,36 @@
 
     vkCreateBuffer(m_device, &vkBufferCreateInfo, NULL, &m_vkBuffer);
 
-    VkMemoryRequirements vkMemoryRequirements = {};
-    vkGetBufferMemoryRequirements(m_device, m_vkBuffer, &vkMemoryRequirements);
-    m_size = vkMemoryRequirements.size;
-    m_alignment = vkMemoryRequirements.alignment;
+    VkMemoryDedicatedRequirements vkMemoryDedicatedRequirements = {};
+    vkMemoryDedicatedRequirements.sType =
+        VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS;
+    vkMemoryDedicatedRequirements.pNext = NULL;
+
+    VkMemoryRequirements2 vkMemoryRequirements = {};
+    vkMemoryRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
+    vkMemoryRequirements.pNext = &vkMemoryDedicatedRequirements;
+
+    VkBufferMemoryRequirementsInfo2 vkMemoryRequirementsInfo = {};
+
+    vkMemoryRequirementsInfo.sType =
+        VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2;
+    vkMemoryRequirementsInfo.buffer = m_vkBuffer;
+    vkMemoryRequirementsInfo.pNext = NULL;
+
+    vkGetBufferMemoryRequirements2(m_device, &vkMemoryRequirementsInfo,
+                                   &vkMemoryRequirements);
+
+    m_dedicated = vkMemoryDedicatedRequirements.requiresDedicatedAllocation;
+
+    m_size = vkMemoryRequirements.memoryRequirements.size;
+    m_alignment = vkMemoryRequirements.memoryRequirements.alignment;
     const VulkanMemoryTypeList &memoryTypeList =
         m_device.getPhysicalDevice().getMemoryTypeList();
     for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
     {
         uint32_t memoryTypeIndex = memoryTypeList[mtIdx];
-        if ((1 << memoryTypeIndex) & vkMemoryRequirements.memoryTypeBits)
+        if ((1 << memoryTypeIndex)
+            & vkMemoryRequirements.memoryRequirements.memoryTypeBits)
         {
             m_memoryTypeList.add(memoryTypeList[mtIdx]);
         }
@@ -1591,16 +1721,36 @@
 
     vkCreateImage(m_device, &vkImageCreateInfo, NULL, &m_vkImage);
     VulkanImageCreateInfo = vkImageCreateInfo;
-    VkMemoryRequirements vkMemoryRequirements = {};
-    vkGetImageMemoryRequirements(m_device, m_vkImage, &vkMemoryRequirements);
-    m_size = vkMemoryRequirements.size;
-    m_alignment = vkMemoryRequirements.alignment;
+
+    VkMemoryDedicatedRequirements vkMemoryDedicatedRequirements = {};
+    vkMemoryDedicatedRequirements.sType =
+        VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS;
+    vkMemoryDedicatedRequirements.pNext = NULL;
+
+    VkMemoryRequirements2 vkMemoryRequirements = {};
+    vkMemoryRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
+    vkMemoryRequirements.pNext = &vkMemoryDedicatedRequirements;
+
+    VkImageMemoryRequirementsInfo2 vkMemoryRequirementsInfo = {};
+
+    vkMemoryRequirementsInfo.sType =
+        VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2;
+    vkMemoryRequirementsInfo.image = m_vkImage;
+    vkMemoryRequirementsInfo.pNext = NULL;
+
+    vkGetImageMemoryRequirements2(m_device, &vkMemoryRequirementsInfo,
+                                  &vkMemoryRequirements);
+    m_size = vkMemoryRequirements.memoryRequirements.size;
+    m_alignment = vkMemoryRequirements.memoryRequirements.alignment;
+    m_dedicated = vkMemoryDedicatedRequirements.requiresDedicatedAllocation;
+
     const VulkanMemoryTypeList &memoryTypeList =
         m_device.getPhysicalDevice().getMemoryTypeList();
     for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++)
     {
         uint32_t memoryTypeIndex = memoryTypeList[mtIdx];
-        if ((1 << memoryTypeIndex) & vkMemoryRequirements.memoryTypeBits)
+        if ((1 << memoryTypeIndex)
+            & vkMemoryRequirements.memoryRequirements.memoryTypeBits)
         {
             m_memoryTypeList.add(memoryTypeList[mtIdx]);
         }
@@ -1629,6 +1779,8 @@
 
 uint64_t VulkanImage::getAlignment() const { return m_alignment; }
 
+bool VulkanImage::isDedicated() const { return m_dedicated; }
+
 const VulkanMemoryTypeList &VulkanImage::getMemoryTypeList() const
 {
     return m_memoryTypeList;
@@ -1645,14 +1797,14 @@
 
 VulkanImage2D::VulkanImage2D(
     const VulkanDevice &device, VulkanFormat format, uint32_t width,
-    uint32_t height, uint32_t numMipLevels,
+    uint32_t height, VulkanImageTiling imageTiling, uint32_t numMipLevels,
     VulkanExternalMemoryHandleType externalMemoryHandleType,
     VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
     VulkanSharingMode sharingMode)
     : VulkanImage(device, VULKAN_IMAGE_TYPE_2D, format,
                   VulkanExtent3D(width, height, 1), numMipLevels, 1,
-                  externalMemoryHandleType, imageCreateFlag,
-                  VULKAN_IMAGE_TILING_OPTIMAL, imageUsage, sharingMode)
+                  externalMemoryHandleType, imageCreateFlag, imageTiling,
+                  imageUsage, sharingMode)
 {}
 
 VulkanImage2D::~VulkanImage2D() {}
@@ -1839,7 +1991,8 @@
     const VulkanDevice &device, const VulkanImage &image,
     const VulkanMemoryType &memoryType,
     VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name)
-    : m_device(device), m_size(image.getSize()), m_isDedicated(true)
+    : m_device(device), m_size(image.getSize()),
+      m_isDedicated(image.isDedicated())
 {
 #if defined(_WIN32) || defined(_WIN64)
     WindowsSecurityAttributes winSecurityAttributes;
@@ -1872,20 +2025,95 @@
     VkMemoryDedicatedAllocateInfo vkMemoryDedicatedAllocateInfo = {};
     vkMemoryDedicatedAllocateInfo.sType =
         VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
-    vkMemoryDedicatedAllocateInfo.pNext =
-        externalMemoryHandleType ? &vkExportMemoryAllocateInfoKHR : NULL;
+    vkMemoryDedicatedAllocateInfo.pNext = NULL;
     vkMemoryDedicatedAllocateInfo.image = image;
     vkMemoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;
 
     VkMemoryAllocateInfo vkMemoryAllocateInfo = {};
     vkMemoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
-    vkMemoryAllocateInfo.pNext = &vkMemoryDedicatedAllocateInfo;
     vkMemoryAllocateInfo.allocationSize = m_size;
     vkMemoryAllocateInfo.memoryTypeIndex = (uint32_t)memoryType;
 
+    if (m_isDedicated)
+    {
+        vkMemoryAllocateInfo.pNext = &vkMemoryDedicatedAllocateInfo;
+        vkMemoryDedicatedAllocateInfo.pNext =
+            externalMemoryHandleType ? &vkExportMemoryAllocateInfoKHR : NULL;
+    }
+    else
+    {
+        vkMemoryAllocateInfo.pNext =
+            externalMemoryHandleType ? &vkExportMemoryAllocateInfoKHR : NULL;
+    }
+
     vkAllocateMemory(m_device, &vkMemoryAllocateInfo, NULL, &m_vkDeviceMemory);
 }
 
+// Allocate device memory sized for buffer from the given memory type.
+//
+// When externalMemoryHandleType is non-zero the allocation is exportable with
+// that handle type; on Windows, name and the security attributes are attached
+// when the handle type has the OPAQUE_WIN32_NT bit. If the buffer reports that
+// it needs a dedicated allocation, the memory is dedicated to it. The
+// vkAllocateMemory result is checked with ASSERT_SUCCESS.
+VulkanDeviceMemory::VulkanDeviceMemory(
+    const VulkanDevice &device, const VulkanBuffer &buffer,
+    const VulkanMemoryType &memoryType,
+    VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name)
+    : m_device(device), m_size(buffer.getSize()),
+      m_isDedicated(buffer.isDedicated())
+{
+#if defined(_WIN32) || defined(_WIN64)
+    WindowsSecurityAttributes winSecurityAttributes;
+
+    VkExportMemoryWin32HandleInfoKHR win32HandleInfo = {};
+    win32HandleInfo.sType =
+        VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
+    win32HandleInfo.pNext = NULL;
+    win32HandleInfo.pAttributes = &winSecurityAttributes;
+    win32HandleInfo.dwAccess =
+        DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
+    win32HandleInfo.name = (LPCWSTR)name;
+#endif
+
+    VkExportMemoryAllocateInfoKHR exportInfo = {};
+    exportInfo.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR;
+    exportInfo.pNext = NULL;
+#if defined(_WIN32) || defined(_WIN64)
+    if (externalMemoryHandleType
+        & VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT)
+    {
+        exportInfo.pNext = &win32HandleInfo;
+    }
+#endif
+    exportInfo.handleTypes =
+        (VkExternalMemoryHandleTypeFlagsKHR)externalMemoryHandleType;
+
+    // Head of the extension chain that follows the optional dedicated info.
+    const void *exportChain = NULL;
+    if (externalMemoryHandleType)
+    {
+        exportChain = &exportInfo;
+    }
+
+    VkMemoryDedicatedAllocateInfo dedicatedInfo = {};
+    dedicatedInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
+    dedicatedInfo.pNext = exportChain;
+    dedicatedInfo.image = VK_NULL_HANDLE;
+    dedicatedInfo.buffer = buffer;
+
+    VkMemoryAllocateInfo allocInfo = {};
+    allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    // Only a buffer that needs it gets the dedicated info in the chain.
+    allocInfo.pNext = m_isDedicated ? &dedicatedInfo : exportChain;
+    allocInfo.allocationSize = m_size;
+    allocInfo.memoryTypeIndex = (uint32_t)memoryType;
+
+    VkResult res =
+        vkAllocateMemory(m_device, &allocInfo, NULL, &m_vkDeviceMemory);
+    ASSERT_SUCCESS(res, "Failed to allocate device memory");
+}
+
 VulkanDeviceMemory::~VulkanDeviceMemory()
 {
     vkFreeMemory(m_device, m_vkDeviceMemory, NULL);
@@ -1952,11 +2180,21 @@
 
 void VulkanDeviceMemory::bindBuffer(const VulkanBuffer &buffer, uint64_t offset)
 {
+    if (buffer.isDedicated() && !m_isDedicated)
+    { // buffer is flagged dedicated but this memory object is not: refuse
+        throw std::runtime_error(
+            "Buffer requires dedicated memory.  Failed to bind");
+    } // NOTE(review): the result of the bind call below is discarded
     vkBindBufferMemory(m_device, buffer, m_vkDeviceMemory, offset);
 }
 
 void VulkanDeviceMemory::bindImage(const VulkanImage &image, uint64_t offset)
 {
+    if (image.isDedicated() && !m_isDedicated)
+    { // image is flagged dedicated but this memory object is not: refuse
+        throw std::runtime_error(
+            "Image requires dedicated memory.  Failed to bind");
+    } // NOTE(review): the result of the bind call below is discarded
     vkBindImageMemory(m_device, image, m_vkDeviceMemory, offset);
 }
 
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
index 37925ee..7fcc70f 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
@@ -21,6 +21,7 @@
 #include "vulkan_wrapper_types.hpp"
 #include "vulkan_list_map.hpp"
 #include "vulkan_api_list.hpp"
+#include <memory>
 
 class VulkanInstance {
     friend const VulkanInstance &getVulkanInstance();
@@ -145,6 +146,20 @@
     operator VkDevice() const;
 };
 
+class VulkanFence { // Wrapper pairing a VkFence handle with a VkDevice handle.
+    friend class VulkanQueue; // VulkanQueue::submit() accepts a VulkanFence
+
+protected:
+    VkFence fence;
+    VkDevice device; // presumably the device the fence was created on - confirm
+
+public:
+    VulkanFence(const VulkanDevice &device);
+    virtual ~VulkanFence();
+    void reset(); // implementation not shown here; presumably resets `fence`
+    void wait(); // implementation not shown here; presumably blocks on `fence`
+};
+
 class VulkanQueue {
     friend class VulkanDevice;
 
@@ -157,6 +172,8 @@
 
 public:
     const VulkanQueueFamily &getQueueFamily();
+    void submit(const VulkanCommandBuffer &commandBuffer,
+                const std::shared_ptr<VulkanFence> &fence);
     void submit(const VulkanSemaphoreList &waitSemaphoreList,
                 const VulkanCommandBufferList &commandBufferList,
                 const VulkanSemaphoreList &signalSemaphoreList);
@@ -311,7 +328,11 @@
                         const VulkanDescriptorSetLayout &descriptorSetLayout);
     virtual ~VulkanDescriptorSet();
     void update(uint32_t binding, const VulkanBuffer &buffer);
+    void updateArray(uint32_t binding, unsigned numBuffers,
+                     const VulkanBufferList &buffers);
     void update(uint32_t binding, const VulkanImageView &imageView);
+    void updateArray(uint32_t binding,
+                     const VulkanImageViewList &imageViewList);
     operator VkDescriptorSet() const;
 };
 
@@ -407,6 +428,7 @@
     VkBuffer m_vkBuffer;
     uint64_t m_size;
     uint64_t m_alignment;
+    bool m_dedicated;
     VulkanMemoryTypeList m_memoryTypeList;
 
     VulkanBuffer(const VulkanBuffer &buffer);
@@ -424,6 +446,7 @@
     uint64_t getSize() const;
     uint64_t getAlignment() const;
     const VulkanMemoryTypeList &getMemoryTypeList() const;
+    bool isDedicated() const;
     operator VkBuffer() const;
 };
 
@@ -435,6 +458,7 @@
     const VulkanFormat m_format;
     const uint32_t m_numMipLevels;
     const uint32_t m_numLayers;
+    bool m_dedicated;
     VkImage m_vkImage;
     uint64_t m_size;
     uint64_t m_alignment;
@@ -461,6 +485,7 @@
     uint32_t getNumLayers() const;
     uint64_t getSize() const;
     uint64_t getAlignment() const;
+    bool isDedicated() const;
     const VulkanMemoryTypeList &getMemoryTypeList() const;
     VkImageCreateInfo getVkImageCreateInfo() const;
     operator VkImage() const;
@@ -470,12 +495,11 @@
 protected:
     VkImageView m_vkImageView;
 
-    VulkanImage2D(const VulkanImage2D &image2D);
-
 public:
     VulkanImage2D(
         const VulkanDevice &device, VulkanFormat format, uint32_t width,
-        uint32_t height, uint32_t numMipLevels = 1,
+        uint32_t height, VulkanImageTiling imageTiling,
+        uint32_t numMipLevels = 1,
         VulkanExternalMemoryHandleType externalMemoryHandleType =
             VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
         VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE,
@@ -484,6 +508,8 @@
         VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE);
     virtual ~VulkanImage2D();
     virtual VulkanExtent3D getExtent3D(uint32_t mipLevel = 0) const;
+
+    VulkanImage2D(const VulkanImage2D &image2D);
 };
 
 class VulkanImageView {
@@ -524,6 +550,11 @@
                        VulkanExternalMemoryHandleType externalMemoryHandleType =
                            VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
                        const void *name = NULL);
+    VulkanDeviceMemory(const VulkanDevice &device, const VulkanBuffer &buffer,
+                       const VulkanMemoryType &memoryType,
+                       VulkanExternalMemoryHandleType externalMemoryHandleType =
+                           VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE,
+                       const void *name = NULL);
     virtual ~VulkanDeviceMemory();
     uint64_t getSize() const;
 #ifdef _WIN32
@@ -569,7 +600,6 @@
     operator VkSemaphore() const;
 };
 
-
 #define VK_FUNC_DECL(name) extern "C" PFN_##name _##name;
 VK_FUNC_LIST
 #if defined(_WIN32) || defined(_WIN64)
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
index 2473a1d..fcd1937 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
@@ -169,7 +169,9 @@
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
     VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
-        | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR
+        | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
+    VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD =
+        VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR
 };
 
 enum VulkanBufferUsage
diff --git a/test_conformance/commonfns/main.cpp b/test_conformance/commonfns/main.cpp
index 3e4b0b8..645d3f7 100644
--- a/test_conformance/commonfns/main.cpp
+++ b/test_conformance/commonfns/main.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,8 +18,10 @@
 #include <string.h>
 #include "procs.h"
 #include "test_base.h"
+#include "harness/kernelHelpers.h"
 
 std::map<size_t, std::string> BaseFunctionTest::type2name;
+cl_half_rounding_mode BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
 
 int g_arrVecSizes[kVectorSizeCount + kStrangeVectorSizeCount];
 int g_arrStrangeVectorSizes[kStrangeVectorSizeCount] = {3};
@@ -45,17 +47,38 @@
 
 const int test_num = ARRAY_SIZE( test_list );
 
+// Harness init check: for cl_khr_fp16 devices, selects the rounding mode used
+// by conv_to_half(); fails if neither RTE nor RTZ is reported for half.
+test_status InitCL(cl_device_id device)
+{
+    if (!is_extension_available(device, "cl_khr_fp16")) return TEST_PASS;
+    const cl_device_fp_config fpConfigHalf =
+        get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+    if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+    {
+        BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
+    }
+    else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+    {
+        BaseFunctionTest::halfRoundingMode = CL_HALF_RTZ;
+    }
+    else
+    {
+        // Newline added so the message is not glued to the next log line.
+        log_error("Error while acquiring half rounding mode\n");
+        return TEST_FAIL;
+    }
+    return TEST_PASS;
+}
+
 int main(int argc, const char *argv[])
 {
     initVecSizes();
 
-    if (BaseFunctionTest::type2name.empty())
-    {
-        BaseFunctionTest::type2name[sizeof(half)] = "half";
-        BaseFunctionTest::type2name[sizeof(float)] = "float";
-        BaseFunctionTest::type2name[sizeof(double)] = "double";
-    }
+    BaseFunctionTest::type2name[sizeof(half)] = "half";
+    BaseFunctionTest::type2name[sizeof(float)] = "float";
+    BaseFunctionTest::type2name[sizeof(double)] = "double";
 
-    return runTestHarness(argc, argv, test_num, test_list, false, 0);
+    return runTestHarnessWithCheck(argc, argv, test_num, test_list, false, 0,
+                                   InitCL);
 }
-
diff --git a/test_conformance/commonfns/test_base.h b/test_conformance/commonfns/test_base.h
index 4429104..be36ed2 100644
--- a/test_conformance/commonfns/test_base.h
+++ b/test_conformance/commonfns/test_base.h
@@ -19,27 +19,23 @@
 #include <vector>
 #include <map>
 #include <memory>
+#include <cmath>
 
 #include <CL/cl_half.h>
 #include <CL/cl_ext.h>
 
-#include "harness/deviceInfo.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 
-
 template <typename T>
 using VerifyFuncBinary = int (*)(const T *const, const T *const, const T *const,
                                  const int num, const int vs, const int vp);
 
-
 template <typename T>
 using VerifyFuncUnary = int (*)(const T *const, const T *const, const int num);
 
-
 using half = cl_half;
 
-
 struct BaseFunctionTest
 {
     BaseFunctionTest(cl_device_id device, cl_context context,
@@ -61,9 +57,9 @@
     bool vecParam;
 
     static std::map<size_t, std::string> type2name;
+    static cl_half_rounding_mode halfRoundingMode;
 };
 
-
 struct MinTest : BaseFunctionTest
 {
     MinTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -74,7 +70,6 @@
     cl_int Run() override;
 };
 
-
 struct MaxTest : BaseFunctionTest
 {
     MaxTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -85,7 +80,6 @@
     cl_int Run() override;
 };
 
-
 struct ClampTest : BaseFunctionTest
 {
     ClampTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -96,7 +90,6 @@
     cl_int Run() override;
 };
 
-
 struct DegreesTest : BaseFunctionTest
 {
     DegreesTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -107,7 +100,6 @@
     cl_int Run() override;
 };
 
-
 struct RadiansTest : BaseFunctionTest
 {
     RadiansTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -118,7 +110,6 @@
     cl_int Run() override;
 };
 
-
 struct SignTest : BaseFunctionTest
 {
     SignTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -129,7 +120,6 @@
     cl_int Run() override;
 };
 
-
 struct SmoothstepTest : BaseFunctionTest
 {
     SmoothstepTest(cl_device_id device, cl_context context,
@@ -141,7 +131,6 @@
     cl_int Run() override;
 };
 
-
 struct StepTest : BaseFunctionTest
 {
     StepTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -152,7 +141,6 @@
     cl_int Run() override;
 };
 
-
 struct MixTest : BaseFunctionTest
 {
     MixTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -163,19 +151,71 @@
     cl_int Run() override;
 };
 
-
-template <typename... Args>
-std::string string_format(const std::string &format, Args... args)
+template <typename T> float UlpFn(const T &val, const double &r)
 {
-    int sformat = std::snprintf(nullptr, 0, format.c_str(), args...) + 1;
-    if (sformat <= 0)
-        throw std::runtime_error("string_format: string processing error.");
-    auto format_size = static_cast<size_t>(sformat);
-    std::unique_ptr<char[]> buffer(new char[format_size]);
-    std::snprintf(buffer.get(), format_size, format.c_str(), args...);
-    return std::string(buffer.get(), buffer.get() + format_size - 1);
+    // Select the harness ULP helper that matches the element type T. These
+    // are plain (non-constexpr) ifs, so every call below is compiled for
+    // every instantiation of T.
+    const bool isHalf = std::is_same<T, half>::value;
+    const bool isFloat = std::is_same<T, float>::value;
+    const bool isDouble = std::is_same<T, double>::value;
+
+    if (isHalf) return Ulp_Error_Half(val, r);
+
+    if (isFloat) return Ulp_Error(val, r);
+
+    if (isDouble) return Ulp_Error_Double(val, r);
+
+    // Any other T is unsupported: log it and fall through to the sentinel
+    // return value below.
+    log_error("UlpFn: unsupported data type\n");
+
+    return -1.f; // wrong val
+}
 
+template <typename T> inline double conv_to_dbl(const T &val)
+{
+    // Half values are raw FP16 bit patterns and must be decoded first;
+    // every other type is converted to double directly.
+    const bool decodeHalf = std::is_same<T, half>::value;
+    return decodeHalf ? (double)cl_half_to_float(val) : (double)val;
+}
+
+template <typename T> inline double conv_to_flt(const T &val)
+{
+    // Value is narrowed to float precision (half inputs are decoded first).
+    // NOTE(review): declared return type is double, so it is widened again.
+    const bool decodeHalf = std::is_same<T, half>::value;
+    return decodeHalf ? (float)cl_half_to_float(val) : (float)val;
+}
+
+template <typename T> inline half conv_to_half(const T &val)
+{
+    // Non-floating-point inputs are not converted and yield a zero half.
+    if (!std::is_floating_point<T>::value) return 0;
+    return cl_half_from_float(val, BaseFunctionTest::halfRoundingMode);
+}
+
+template <typename T> bool isfinite_fp(const T &v)
+{
+    if (!std::is_same<T, half>::value)
+    {
+#if defined(_WIN32)
+        return isfinite(v);
+#else
+        return std::isfinite(v);
+#endif
+    }
+    else
+    {
+        // Half path: pull the exponent and mantissa out of the raw FP16 bits.
+        const uint16_t bits = (half)v;
+        const uint16_t expField = (bits >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+        const uint16_t mantField = bits & 0x3FF;
+        // Rejects only +/-Inf (exponent all ones, mantissa 0); NaN passes.
+        return !(expField == 0x1F && mantField == 0);
+    }
+}
 
 template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
diff --git a/test_conformance/commonfns/test_binary_fn.cpp b/test_conformance/commonfns/test_binary_fn.cpp
index 1eb12f7..a6c7564 100644
--- a/test_conformance/commonfns/test_binary_fn.cpp
+++ b/test_conformance/commonfns/test_binary_fn.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -22,6 +22,7 @@
 
 #include "harness/deviceInfo.h"
 #include "harness/typeWrappers.h"
+#include "harness/stringHelpers.h"
 
 #include "procs.h"
 #include "test_base.h"
@@ -53,7 +54,6 @@
 "    vstore3(%s(vload3(tid,x), y[tid] ), tid, dst);\n"
 "}\n";
 
-
 template <typename T>
 int test_binary_fn(cl_device_id device, cl_context context,
                    cl_command_queue queue, int n_elems,
@@ -105,6 +105,16 @@
             input_ptr[1][j] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -125,22 +135,22 @@
             {
                 std::string str = binary_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
             else
             {
                 std::string str = binary_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
         }
         else
         {
             // do regular
             std::string str = binary_fn_code_pattern;
-            kernelSource = string_format(
+            kernelSource = str_sprintf(
                 str, pragma_str.c_str(), tname.c_str(), vecSizeNames[i],
                 tname.c_str(), vecSecParam ? vecSizeNames[i] : "",
                 tname.c_str(), vecSizeNames[i], fnName.c_str());
@@ -203,13 +213,20 @@
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] < y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) < conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                // Format every value through conv_to_dbl(): it decodes half
+                // bit patterns and widens float/double unchanged, so each %g
+                // receives a genuine double. This includes the expected
+                // value v, which is a raw cl_half bit pattern when T is half
+                // and must not be handed to %g directly.
+                log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                          "(index %d is "
+                          "vector %d, element %d, for vector size %d)\n",
+                          k, conv_to_dbl(x[k]), l, conv_to_dbl(y[l]), k,
+                          conv_to_dbl(out[k]), conv_to_dbl(v), k, i, j,
+                          vecSize);
                 return -1;
             }
         }
@@ -227,13 +244,20 @@
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] > y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) > conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                // Format every value through conv_to_dbl(): it decodes half
+                // bit patterns and widens float/double unchanged, so each %g
+                // receives a genuine double. This includes the expected
+                // value v, which is a raw cl_half bit pattern when T is half
+                // and must not be handed to %g directly.
+                log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                          "(index %d is "
+                          "vector %d, element %d, for vector size %d)\n",
+                          k, conv_to_dbl(x[k]), l, conv_to_dbl(y[l]), k,
+                          conv_to_dbl(out[k]), conv_to_dbl(v), k, i, j,
+                          vecSize);
                 return -1;
             }
         }
@@ -246,6 +270,13 @@
 cl_int MaxTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        max_verify<cl_half>);
+        test_error(error, "MaxTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, max_verify<float>);
@@ -265,6 +296,13 @@
 cl_int MinTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        min_verify<cl_half>);
+        test_error(error, "MinTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, min_verify<float>);
diff --git a/test_conformance/commonfns/test_clamp.cpp b/test_conformance/commonfns/test_clamp.cpp
index 0e96fb6..1bf4067 100644
--- a/test_conformance/commonfns/test_clamp.cpp
+++ b/test_conformance/commonfns/test_clamp.cpp
@@ -26,12 +26,10 @@
 #include "procs.h"
 #include "test_base.h"
 
-
 #ifndef M_PI
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 #define CLAMP_KERNEL(type)                                                     \
     const char *clamp_##type##_kernel_code = EMIT_PRAGMA_DIRECTIVE             \
         "__kernel void test_clamp(__global " #type " *x, __global " #type      \
@@ -64,6 +62,14 @@
         "vload3(tid,maxval)), tid, dst);\n"                                    \
         "}\n";
 
+#define EMIT_PRAGMA_DIRECTIVE "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+CLAMP_KERNEL(half)
+CLAMP_KERNEL_V(half, 2)
+CLAMP_KERNEL_V(half, 4)
+CLAMP_KERNEL_V(half, 8)
+CLAMP_KERNEL_V(half, 16)
+CLAMP_KERNEL_V3(half, 3)
+#undef EMIT_PRAGMA_DIRECTIVE
 
 #define EMIT_PRAGMA_DIRECTIVE " "
 CLAMP_KERNEL(float)
@@ -83,6 +89,10 @@
 CLAMP_KERNEL_V3(double, 3)
 #undef EMIT_PRAGMA_DIRECTIVE
 
+const char *clamp_half_codes[] = {
+    clamp_half_kernel_code,  clamp_half2_kernel_code,  clamp_half4_kernel_code,
+    clamp_half8_kernel_code, clamp_half16_kernel_code, clamp_half3_kernel_code
+};
 const char *clamp_float_codes[] = {
     clamp_float_kernel_code,   clamp_float2_kernel_code,
     clamp_float4_kernel_code,  clamp_float8_kernel_code,
@@ -96,21 +106,42 @@
 
 namespace {
 
-
 template <typename T>
 int verify_clamp(const T *const x, const T *const minval, const T *const maxval,
                  const T *const outptr, int n)
 {
-    T t;
-    for (int i = 0; i < n; i++)
+    if (std::is_same<T, half>::value)
     {
-        t = std::min(std::max(x[i], minval[i]), maxval[i]);
-        if (t != outptr[i])
+        float t;
+        for (int i = 0; i < n; i++)
         {
-            log_error(
-                "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n", i,
-                x[i], minval[i], maxval[i], t, outptr[i]);
-            return -1;
+            t = std::min(
+                std::max(cl_half_to_float(x[i]), cl_half_to_float(minval[i])),
+                cl_half_to_float(maxval[i]));
+            if (t != cl_half_to_float(outptr[i]))
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, cl_half_to_float(x[i]), cl_half_to_float(minval[i]),
+                    cl_half_to_float(maxval[i]), t,
+                    cl_half_to_float(outptr[i]));
+                return -1;
+            }
+        }
+    }
+    else
+    {
+        T t;
+        for (int i = 0; i < n; i++)
+        {
+            t = std::min(std::max(x[i], minval[i]), maxval[i]);
+            if (t != outptr[i])
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, x[i], minval[i], maxval[i], t, outptr[i]);
+                return -1;
+            }
         }
     }
 
@@ -118,7 +149,6 @@
 }
 }
 
-
 template <typename T>
 int test_clamp_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems)
@@ -169,6 +199,17 @@
             input_ptr[2][j] = get_random_double(input_ptr[1][j], 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        for (j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[2][j] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[1][j]), fval, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -194,9 +235,16 @@
                 "test_clamp");
             test_error(err, "Unable to create kernel");
         }
+        else if (std::is_same<T, half>::value)
+        {
+            err = create_single_kernel_helper(
+                context, &programs[i], &kernels[i], 1, &clamp_half_codes[i],
+                "test_clamp");
+            test_error(err, "Unable to create kernel");
+        }
 
-        log_info("Just made a program for float, i=%d, size=%d, in slot %d\n",
-                 i, g_arrVecSizes[i], i);
+        log_info("Just made a program for %s, i=%d, size=%d, in slot %d\n",
+                 tname.c_str(), i, g_arrVecSizes[i], i);
         fflush(stdout);
 
         for (j = 0; j < 4; j++)
@@ -239,10 +287,14 @@
     return err;
 }
 
-
 cl_int ClampTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_clamp_fn<cl_half>(device, context, queue, num_elems);
+        test_error(error, "ClampTest::Run<cl_half> failed");
+    }
 
     error = test_clamp_fn<float>(device, context, queue, num_elems);
     test_error(error, "ClampTest::Run<float> failed");
@@ -256,7 +308,6 @@
     return error;
 }
 
-
 int test_clamp(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_mix.cpp b/test_conformance/commonfns/test_mix.cpp
index 92c1010..2a06e43 100644
--- a/test_conformance/commonfns/test_mix.cpp
+++ b/test_conformance/commonfns/test_mix.cpp
@@ -18,6 +18,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
@@ -52,33 +54,42 @@
     "    vstore3(mix(vload3(tid, x), vload3(tid, y), a[tid]), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR 1e-3
 
 namespace {
 
-
 template <typename T>
 int verify_mix(const T *const inptrX, const T *const inptrY,
                const T *const inptrA, const T *const outptr, const int n,
                const int veclen, const bool vecParam)
 {
-    T r;
-    float delta = 0.0f;
+    double r, o;
+    float delta = 0.f, max_delta = 0.f;
     int i;
 
     if (vecParam)
     {
         for (i = 0; i < n * veclen; i++)
         {
-            r = inptrX[i] + ((inptrY[i] - inptrX[i]) * inptrA[i]);
-            delta = fabs(double(r - outptr[i])) / r;
-            if (delta > MAX_ERR)
+            r = conv_to_dbl(inptrX[i])
+                + ((conv_to_dbl(inptrY[i]) - conv_to_dbl(inptrX[i]))
+                   * conv_to_dbl(inptrA[i]));
+
+            o = conv_to_dbl(outptr[i]);
+            delta = fabs(double(r - o)) / r;
+            if (!std::is_same<T, half>::value)
             {
-                log_error(
-                    "%d) verification error: mix(%a, %a, %a) = *%a vs. %a\n", i,
-                    inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
-                return -1;
+                if (delta > MAX_ERR)
+                {
+                    log_error("%d) verification error: mix(%a, %a, %a) = *%a "
+                              "vs. %a\n",
+                              i, inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
+                    return -1;
+                }
+            }
+            else
+            {
+                max_delta = std::max(max_delta, delta);
             }
         }
     }
@@ -90,25 +101,40 @@
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                r = inptrX[vi] + ((inptrY[vi] - inptrX[vi]) * inptrA[i]);
-                delta = fabs(double(r - outptr[vi])) / r;
-                if (delta > MAX_ERR)
+                r = conv_to_dbl(inptrX[vi])
+                    + ((conv_to_dbl(inptrY[vi]) - conv_to_dbl(inptrX[vi]))
+                       * conv_to_dbl(inptrA[i]));
+                delta = fabs(double(r - conv_to_dbl(outptr[vi]))) / r;
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: mix(%a, "
-                              "%a, %a) = *%a vs. %a\n",
-                              ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
-                              outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error(
+                            "{%d, element %d}) verification error: mix(%a, "
+                            "%a, %a) = *%a vs. %a\n",
+                            ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
+                            outptr[vi]);
+                        return -1;
+                    }
+                }
+                else
+                {
+                    max_delta = std::max(max_delta, delta);
                 }
             }
         }
     }
 
+    // The accuracy of mix for cl_khr_fp16 is implementation-defined, so for
+    // half this test only reports the maximum error and does not check it
+    // against a threshold.
+    if (std::is_same<T, half>::value)
+        log_error("mix half verification result, max delta: %a\n", max_delta);
+
     return 0;
 }
 } // namespace
 
-
 template <typename T>
 int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
                 int n_elems, bool vecParam)
@@ -120,7 +146,7 @@
     std::vector<clKernelWrapper> kernels;
 
     int err, i;
-    MTdataHolder d = MTdataHolder(gRandomSeed);
+    MTdataHolder d(gRandomSeed);
 
     assert(BaseFunctionTest::type2name.find(sizeof(T))
            != BaseFunctionTest::type2name.end());
@@ -142,19 +168,32 @@
         test_error(err, "clCreateBuffer failed");
     }
 
-    for (i = 0; i < num_elements; i++)
-    {
-        input_ptr[0][i] = (T)genrand_real1(d);
-        input_ptr[1][i] = (T)genrand_real1(d);
-        input_ptr[2][i] = (T)genrand_real1(d);
-    }
-
     std::string pragma_str;
     if (std::is_same<T, double>::value)
     {
         pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
     }
 
+    if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[1][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[2][i] = conv_to_half((float)genrand_real1(d));
+        }
+    }
+    else
+    {
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = (T)genrand_real1(d);
+            input_ptr[1][i] = (T)genrand_real1(d);
+            input_ptr[2][i] = (T)genrand_real1(d);
+        }
+    }
+
     for (i = 0; i < 3; i++)
     {
         err = clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0,
@@ -164,7 +203,6 @@
     }
 
     char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
-
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
@@ -174,15 +212,15 @@
             {
                 std::string str = mix_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = mix_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -190,10 +228,10 @@
             // regular path
             std::string str = mix_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i],
-                              tname.c_str(), vecParam ? vecSizeNames[i] : "",
-                              tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i],
+                            tname.c_str(), vecParam ? vecSizeNames[i] : "",
+                            tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -242,10 +280,14 @@
     return err;
 }
 
-
 cl_int MixTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_mix_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "MixTest::Run<cl_half> failed");
+    }
 
     error = test_mix_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "MixTest::Run<float> failed");
@@ -260,7 +302,6 @@
     return error;
 }
 
-
 int test_mix(cl_device_id device, cl_context context, cl_command_queue queue,
              int n_elems)
 {
@@ -268,7 +309,6 @@
                                    true);
 }
 
-
 int test_mixf(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/commonfns/test_smoothstep.cpp b/test_conformance/commonfns/test_smoothstep.cpp
index 31948d3..5afc2d0 100644
--- a/test_conformance/commonfns/test_smoothstep.cpp
+++ b/test_conformance/commonfns/test_smoothstep.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *smoothstep_fn_code_pattern =
     "%s\n" /* optional pragma */
     "__kernel void test_fn(__global %s%s *e0, __global %s%s *e1, __global %s%s "
@@ -53,38 +54,43 @@
     "    vstore3(smoothstep(e0[tid], e1[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR (1e-5f)
 
 namespace {
 
-
 template <typename T>
 int verify_smoothstep(const T *const edge0, const T *const edge1,
                       const T *const x, const T *const outptr, const int n,
                       const int veclen, const bool vecParam)
 {
-    T r, t;
-    float delta = 0;
+    double r, t;
+    float delta = 0, max_delta = 0;
 
     if (vecParam)
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            t = (x[i] - edge0[i]) / (edge1[i] - edge0[i]);
-            if (t < 0.0f)
-                t = 0.0f;
-            else if (t > 1.0f)
-                t = 1.0f;
-            r = t * t * (3.0f - 2.0f * t);
-            delta = (float)fabs(r - outptr[i]);
-            if (delta > MAX_ERR)
+            t = (conv_to_dbl(x[i]) - conv_to_dbl(edge0[i]))
+                / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+            if (t < 0.0)
+                t = 0.0;
+            else if (t > 1.0)
+                t = 1.0;
+            r = t * t * (3.0 - 2.0 * t);
+            delta = (float)fabs(r - conv_to_dbl(outptr[i]));
+            if (!std::is_same<T, half>::value)
             {
-                log_error("%d) verification error: smoothstep(%a, %a, %a) = "
-                          "*%a vs. %a\n",
-                          i, x[i], edge0[i], edge1[i], r, outptr[i]);
-                return -1;
+                if (delta > MAX_ERR)
+                {
+                    log_error(
+                        "%d) verification error: smoothstep(%a, %a, %a) = "
+                        "*%a vs. %a\n",
+                        i, x[i], edge0[i], edge1[i], r, outptr[i]);
+                    return -1;
+                }
             }
+            else
+                max_delta = std::max(max_delta, delta);
         }
     }
     else
@@ -95,32 +101,48 @@
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                t = (x[vi] - edge0[i]) / (edge1[i] - edge0[i]);
-                if (t < 0.0f)
-                    t = 0.0f;
-                else if (t > 1.0f)
-                    t = 1.0f;
-                r = t * t * (3.0f - 2.0f * t);
-                delta = (float)fabs(r - outptr[vi]);
-                if (delta > MAX_ERR)
+                t = (conv_to_dbl(x[vi]) - conv_to_dbl(edge0[i]))
+                    / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+                if (t < 0.0)
+                    t = 0.0;
+                else if (t > 1.0)
+                    t = 1.0;
+                r = t * t * (3.0 - 2.0 * t);
+                delta = (float)fabs(r - conv_to_dbl(outptr[vi]));
+
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: "
-                              "smoothstep(%a, %a, %a) = *%a vs. %a\n",
-                              ii, j, x[vi], edge0[i], edge1[i], r, outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error("{%d, element %d}) verification error: "
+                                  "smoothstep(%a, %a, %a) = *%a vs. %a\n",
+                                  ii, j, x[vi], edge0[i], edge1[i], r,
+                                  outptr[vi]);
+                        return -1;
+                    }
                 }
+                else
+                    max_delta = std::max(max_delta, delta);
             }
         }
     }
+
+    // The accuracy of smoothstep for cl_khr_fp16 is implementation-defined,
+    // so for half this test only reports the maximum error and does not
+    // compare it against a threshold.
+    if (std::is_same<T, half>::value)
+        log_error("smoothstep half verification result, max delta: %a\n",
+                  max_delta);
+
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_smoothstep_fn(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int n_elems, bool vecParam)
+                       cl_command_queue queue, const int n_elems,
+                       const bool vecParam)
 {
     clMemWrapper streams[4];
     std::vector<T> input_ptr[3], output_ptr;
@@ -170,6 +192,17 @@
             input_ptr[2][i] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-65503, 65503, d));
+            input_ptr[1][i] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[0][i]), 65503, d));
+            input_ptr[2][i] = conv_to_half(get_random_float(-65503, 65503, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -179,7 +212,7 @@
         test_error(err, "Unable to write input buffer");
     }
 
-    char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+    const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
     for (i = 0; i < kTotalVecCount; i++)
     {
@@ -190,15 +223,15 @@
             {
                 std::string str = smoothstep_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = smoothstep_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -206,11 +239,12 @@
             // regular path
             std::string str = smoothstep_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
+
         const char *programPtr = kernelSource.c_str();
         err =
             create_single_kernel_helper(context, &programs[i], &kernels[i], 1,
@@ -259,10 +293,15 @@
     return err;
 }
 
-
 cl_int SmoothstepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_smoothstep_fn<half>(device, context, queue, num_elems,
+                                         vecParam);
+        test_error(error, "SmoothstepTest::Run<cl_half> failed");
+    }
 
     error =
         test_smoothstep_fn<float>(device, context, queue, num_elems, vecParam);
@@ -278,7 +317,6 @@
     return error;
 }
 
-
 int test_smoothstep(cl_device_id device, cl_context context,
                     cl_command_queue queue, int n_elems)
 {
@@ -286,7 +324,6 @@
                                           "smoothstep", true);
 }
 
-
 int test_smoothstepf(cl_device_id device, cl_context context,
                      cl_command_queue queue, int n_elems)
 {
diff --git a/test_conformance/commonfns/test_step.cpp b/test_conformance/commonfns/test_step.cpp
index dc91766..1cfa96e 100644
--- a/test_conformance/commonfns/test_step.cpp
+++ b/test_conformance/commonfns/test_step.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *step_fn_code_pattern = "%s\n" /* optional pragma */
                                    "__kernel void test_fn(__global %s%s *edge, "
                                    "__global %s%s *x, __global %s%s *dst)\n"
@@ -48,7 +49,6 @@
     "    vstore3(step(edge[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 namespace {
 
 template <typename T>
@@ -62,8 +62,8 @@
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            r = (inptrB[i] < inptrA[i]) ? 0.0 : 1.0;
-            if (r != outptr[i]) return -1;
+            r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[i])) ? 0.0 : 1.0;
+            if (r != conv_to_dbl(outptr[i])) return -1;
         }
     }
     else
@@ -73,24 +73,31 @@
             int ii = i / veclen;
             for (int j = 0; j < veclen && i < n; ++j, ++i)
             {
-                r = (inptrB[i] < inptrA[ii]) ? 0.0f : 1.0f;
-                if (r != outptr[i])
+                r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[ii])) ? 0.0f
+                                                                       : 1.0f;
+                if (r != conv_to_dbl(outptr[i]))
                 {
-                    log_error("Failure @ {%d, element %d}: step(%a,%a) -> *%a "
-                              "vs %a\n",
-                              ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
+                    if (std::is_same<T, half>::value)
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, conv_to_flt(inptrA[ii]),
+                            conv_to_flt(inptrB[i]), r, conv_to_flt(outptr[i]));
+                    else
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
                     return -1;
                 }
             }
         }
     }
-
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_step_fn(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems, bool vecParam)
@@ -140,6 +147,16 @@
             input_ptr[1][i] = get_random_double(-0x40000000, 0x40000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][i] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -160,15 +177,15 @@
             {
                 std::string str = step_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = step_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
         }
         else
@@ -176,9 +193,9 @@
             // regular path
             std::string str = step_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -229,10 +246,14 @@
     return err;
 }
 
-
 cl_int StepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_step_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "StepTest::Run<cl_half> failed");
+    }
 
     error = test_step_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "StepTest::Run<float> failed");
@@ -247,7 +268,6 @@
     return error;
 }
 
-
 int test_step(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
@@ -255,7 +275,6 @@
                                     true);
 }
 
-
 int test_stepf(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_unary_fn.cpp b/test_conformance/commonfns/test_unary_fn.cpp
index fed4389..91b5c21 100644
--- a/test_conformance/commonfns/test_unary_fn.cpp
+++ b/test_conformance/commonfns/test_unary_fn.cpp
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "harness/deviceInfo.h"
+#include "harness/stringHelpers.h"
 #include "harness/typeWrappers.h"
 
 #include "procs.h"
@@ -30,7 +31,6 @@
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 // clang-format off
 const char *unary_fn_code_pattern =
 "%s\n" /* optional pragma */
@@ -51,23 +51,10 @@
 "}\n";
 // clang-format on
 
-
 #define MAX_ERR 2.0f
 
 namespace {
 
-
-template <typename T> float UlpFn(const T &val, const double &r)
-{
-    if (std::is_same<T, double>::value)
-        return Ulp_Error_Double(val, r);
-    else if (std::is_same<T, float>::value)
-        return Ulp_Error(val, r);
-    else if (std::is_same<T, half>::value)
-        return Ulp_Error(val, r);
-}
-
-
 template <typename T>
 int verify_degrees(const T *const inptr, const T *const outptr, int n)
 {
@@ -77,7 +64,11 @@
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (180.0 / M_PI) * inptr[i];
+        r = (180.0 / M_PI) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
 
         error = UlpFn(outptr[i], r);
 
@@ -88,21 +79,32 @@
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_radians(const T *const inptr, const T *const outptr, int n)
 {
@@ -112,8 +114,14 @@
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (M_PI / 180.0) * inptr[i];
-        error = Ulp_Error(outptr[i], r);
+        r = (M_PI / 180.0) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
+
+        error = UlpFn(outptr[i], r);
+
         if (fabsf(error) > max_error)
         {
             max_error = error;
@@ -121,41 +129,51 @@
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_sign(const T *const inptr, const T *const outptr, int n)
 {
-    T r = 0;
+    double r = 0;
     for (int i = 0; i < n; i++)
     {
-        if (inptr[i] > 0.0f)
+        if (conv_to_dbl(inptr[i]) > 0.0f)
             r = 1.0;
-        else if (inptr[i] < 0.0f)
+        else if (conv_to_dbl(inptr[i]) < 0.0f)
             r = -1.0;
         else
             r = 0.0;
-        if (r != outptr[i]) return -1;
+        if (r != conv_to_dbl(outptr[i])) return -1;
     }
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_unary_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems,
@@ -207,33 +225,38 @@
                 get_random_double(-100000.0 * M_PI, 100000.0 * M_PI, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[j] = conv_to_half(get_random_float(
+                (float)(-10000.f * M_PI), (float)(10000.f * M_PI), d));
+        }
+    }
 
     err = clEnqueueWriteBuffer(queue, streams[0], true, 0,
                                sizeof(T) * num_elements, &input_ptr.front(), 0,
                                NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueWriteBuffer failed\n");
-        return -1;
-    }
+    test_error(err, "clEnqueueWriteBuffer failed\n");
 
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
-        char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+        const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
         if (i >= kVectorSizeCount)
         {
             std::string str = unary_fn_code_pattern_v3;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         tname.c_str(), fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       tname.c_str(), fnName.c_str());
         }
         else
         {
             std::string str = unary_fn_code_pattern;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         vecSizeNames[i], tname.c_str(),
-                                         vecSizeNames[i], fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       vecSizeNames[i], tname.c_str(),
+                                       vecSizeNames[i], fnName.c_str());
         }
 
         /* Create kernels */
@@ -290,11 +313,18 @@
     return err;
 }
 
-
 cl_int DegreesTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_degrees<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_degrees<half>);
+        test_error(error, "DegreesTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_degrees<float>);
     test_error(error, "DegreesTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -307,11 +337,18 @@
     return error;
 }
 
-
 cl_int RadiansTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_radians<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_radians<half>);
+        test_error(error, "RadiansTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_radians<float>);
     test_error(error, "RadiansTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -324,11 +361,18 @@
     return error;
 }
 
-
 cl_int SignTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_sign<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_sign<half>);
+        test_error(error, "SignTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_sign<float>);
     test_error(error, "SignTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -341,7 +385,6 @@
     return error;
 }
 
-
 int test_degrees(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -349,7 +392,6 @@
                                        "degrees");
 }
 
-
 int test_radians(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -357,7 +399,6 @@
                                        "radians");
 }
 
-
 int test_sign(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/compiler/test_compile.cpp b/test_conformance/compiler/test_compile.cpp
index f3ee431..3af8125 100644
--- a/test_conformance/compiler/test_compile.cpp
+++ b/test_conformance/compiler/test_compile.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,148 +16,147 @@
 #include "testBase.h"
 #if defined(_WIN32)
 #include <time.h>
-#elif  defined(__linux__) || defined(__APPLE__)
+#elif defined(__linux__) || defined(__APPLE__)
 #include <sys/time.h>
 #include <unistd.h>
 #endif
 #include "harness/conversions.h"
 
 #define MAX_LINE_SIZE_IN_PROGRAM 1024
-#define MAX_LOG_SIZE_IN_PROGRAM  2048
+#define MAX_LOG_SIZE_IN_PROGRAM 2048
 
 const char *sample_kernel_start =
-"__kernel void sample_test(__global float *src, __global int *dst)\n"
-"{\n"
-"    float temp;\n"
-"    int  tid = get_global_id(0);\n";
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    float temp = 0.0f;\n"
+    "    int  tid = get_global_id(0);\n";
 
 const char *sample_kernel_end = "}\n";
 
-const char *sample_kernel_lines[] = {
-"dst[tid] = src[tid];\n",
-"dst[tid] = src[tid] * 3.f;\n",
-"temp = src[tid] / 4.f;\n",
-"dst[tid] = dot(temp,src[tid]);\n",
-"dst[tid] = dst[tid] + temp;\n" };
+const char *sample_kernel_lines[] = { "dst[tid] = src[tid];\n",
+                                      "dst[tid] = src[tid] * 3.f;\n",
+                                      "temp = src[tid] / 4.f;\n",
+                                      "dst[tid] = dot(temp,src[tid]);\n",
+                                      "dst[tid] = dst[tid] + temp;\n" };
 
 /* I compile and link therefore I am. Robert Ioffe */
-/* The following kernels are used in testing Improved Compilation and Linking feature */
+/* The following kernels are used to test the Improved Compilation and Linking
+ * feature. */
 
-const char *simple_kernel =
-"__kernel void\n"
-"CopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n"
-"    int id = (int)get_global_id(0);\n"
-"    dst[id] = src[id];\n"
-"}\n";
+const char *simple_kernel = "__kernel void\n"
+                            "CopyBuffer(\n"
+                            "    __global float* src,\n"
+                            "    __global float* dst )\n"
+                            "{\n"
+                            "    int id = (int)get_global_id(0);\n"
+                            "    dst[id] = src[id];\n"
+                            "}\n";
 
 const char *simple_kernel_with_defines =
-"__kernel void\n"
-"CopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n"
-"    int id = (int)get_global_id(0);\n"
-"    float temp = src[id] - 42;\n"
-"    dst[id] = FIRST + temp + SECOND;\n"
-"}\n";
+    "__kernel void\n"
+    "CopyBuffer(\n"
+    "    __global float* src,\n"
+    "    __global float* dst )\n"
+    "{\n"
+    "    int id = (int)get_global_id(0);\n"
+    "    float temp = src[id] - 42;\n"
+    "    dst[id] = FIRST + temp + SECOND;\n"
+    "}\n";
 
-const char *simple_kernel_template =
-"__kernel void\n"
-"CopyBuffer%d(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n"
-"    int id = (int)get_global_id(0);\n"
-"    dst[id] = src[id];\n"
-"}\n";
+const char *simple_kernel_template = "__kernel void\n"
+                                     "CopyBuffer%d(\n"
+                                     "    __global float* src,\n"
+                                     "    __global float* dst )\n"
+                                     "{\n"
+                                     "    int id = (int)get_global_id(0);\n"
+                                     "    dst[id] = src[id];\n"
+                                     "}\n";
 
-const char *composite_kernel_start =
-"__kernel void\n"
-"CompositeKernel(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n";
+const char *composite_kernel_start = "__kernel void\n"
+                                     "CompositeKernel(\n"
+                                     "    __global float* src,\n"
+                                     "    __global float* dst )\n"
+                                     "{\n";
 
 const char *composite_kernel_end = "}\n";
 
-const char *composite_kernel_template =
-"    CopyBuffer%d(src, dst);\n";
+const char *composite_kernel_template = "    CopyBuffer%d(src, dst);\n";
 
-const char *composite_kernel_extern_template =
-"extern __kernel void\n"
-"CopyBuffer%d(\n"
-"    __global float* src,\n"
-"    __global float* dst );\n";
+const char *composite_kernel_extern_template = "extern __kernel void\n"
+                                               "CopyBuffer%d(\n"
+                                               "    __global float* src,\n"
+                                               "    __global float* dst );\n";
 
-const char *another_simple_kernel =
-"extern __kernel void\n"
-"CopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst );\n"
-"__kernel void\n"
-"AnotherCopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n"
-"    CopyBuffer(src, dst);\n"
-"}\n";
+const char *another_simple_kernel = "extern __kernel void\n"
+                                    "CopyBuffer(\n"
+                                    "    __global float* src,\n"
+                                    "    __global float* dst );\n"
+                                    "__kernel void\n"
+                                    "AnotherCopyBuffer(\n"
+                                    "    __global float* src,\n"
+                                    "    __global float* dst )\n"
+                                    "{\n"
+                                    "    CopyBuffer(src, dst);\n"
+                                    "}\n";
 
-const char* simple_header =
-"extern __kernel void\n"
-"CopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst );\n";
+const char *simple_header = "extern __kernel void\n"
+                            "CopyBuffer(\n"
+                            "    __global float* src,\n"
+                            "    __global float* dst );\n";
 
-const char* simple_header_name = "simple_header.h";
+const char *simple_header_name = "simple_header.h";
 
-const char* another_simple_kernel_with_header =
-"#include \"simple_header.h\"\n"
-"__kernel void\n"
-"AnotherCopyBuffer(\n"
-"    __global float* src,\n"
-"    __global float* dst )\n"
-"{\n"
-"    CopyBuffer(src, dst);\n"
-"}\n";
+const char *another_simple_kernel_with_header = "#include \"simple_header.h\"\n"
+                                                "__kernel void\n"
+                                                "AnotherCopyBuffer(\n"
+                                                "    __global float* src,\n"
+                                                "    __global float* dst )\n"
+                                                "{\n"
+                                                "    CopyBuffer(src, dst);\n"
+                                                "}\n";
 
-const char* header_name_templates[4]   = { "simple_header%d.h",
-                                           "foo/simple_header%d.h",
-                                           "foo/bar/simple_header%d.h",
-                                           "foo/bar/baz/simple_header%d.h"};
+const char *header_name_templates[4] = { "simple_header%d.h",
+                                         "foo/simple_header%d.h",
+                                         "foo/bar/simple_header%d.h",
+                                         "foo/bar/baz/simple_header%d.h" };
 
-const char* include_header_name_templates[4]   = { "#include \"simple_header%d.h\"\n",
-                                                   "#include \"foo/simple_header%d.h\"\n",
-                                                   "#include \"foo/bar/simple_header%d.h\"\n",
-                                                   "#include \"foo/bar/baz/simple_header%d.h\"\n"};
+const char *include_header_name_templates[4] = {
+    "#include \"simple_header%d.h\"\n", "#include \"foo/simple_header%d.h\"\n",
+    "#include \"foo/bar/simple_header%d.h\"\n",
+    "#include \"foo/bar/baz/simple_header%d.h\"\n"
+};
 
-const char* compile_extern_var      = "extern constant float foo;\n";
-const char* compile_extern_struct   = "extern constant struct bar bart;\n";
-const char* compile_extern_function = "extern int baz(int, int);\n";
+const char *compile_extern_var = "extern constant float foo;\n";
+const char *compile_extern_struct = "extern constant struct bar bart;\n";
+const char *compile_extern_function = "extern int baz(int, int);\n";
 
-const char* compile_static_var      = "static constant float foo = 2.78;\n";
-const char* compile_static_struct   = "static constant struct bar {float x, y, z, r; int color; } foo = {3.14159};\n";
-const char* compile_static_function = "static int foo(int x, int y) { return x*x + y*y; }\n";
+const char *compile_static_var = "static constant float foo = 2.78;\n";
+const char *compile_static_struct = "static constant struct bar {float x, y, "
+                                    "z, r; int color; } foo = {3.14159};\n";
+const char *compile_static_function =
+    "static int foo(int x, int y) { return x*x + y*y; }\n";
 
-const char* compile_regular_var      = "constant float foo = 4.0f;\n";
-const char* compile_regular_struct   = "constant struct bar {float x, y, z, r; int color; } foo = {0.f, 0.f, 0.f, 0.f, 0};\n";
-const char* compile_regular_function = "int foo(int x, int y) { return x*x + y*y; }\n";
+const char *compile_regular_var = "constant float foo = 4.0f;\n";
+const char *compile_regular_struct =
+    "constant struct bar {float x, y, z, r; int color; } foo = {0.f, 0.f, 0.f, "
+    "0.f, 0};\n";
+const char *compile_regular_function =
+    "int foo(int x, int y) { return x*x + y*y; }\n";
 
-const char* link_static_var_access = // use with compile_static_var
-"extern constant float foo;\n"
-"float access_foo() { return foo; }\n";
+const char *link_static_var_access = // use with compile_static_var
+    "extern constant float foo;\n"
+    "float access_foo() { return foo; }\n";
 
-const char* link_static_struct_access = // use with compile_static_struct
-"extern constant struct bar{float x, y, z, r; int color; } foo;\n"
-"struct bar access_foo() {return foo; }\n";
+const char *link_static_struct_access = // use with compile_static_struct
+    "extern constant struct bar{float x, y, z, r; int color; } foo;\n"
+    "struct bar access_foo() {return foo; }\n";
 
-const char* link_static_function_access = // use with compile_static_function
-"extern int foo(int, int);\n"
-"int access_foo() { int blah = foo(3, 4); return blah + 5; }\n";
+const char *link_static_function_access = // use with compile_static_function
+    "extern int foo(int, int);\n"
+    "int access_foo() { int blah = foo(3, 4); return blah + 5; }\n";
 
-int test_large_single_compile(cl_context context, cl_device_id deviceID, unsigned int numLines)
+int test_large_single_compile(cl_context context, cl_device_id deviceID,
+                              unsigned int numLines)
 {
     int error;
     cl_program program;
@@ -166,96 +165,113 @@
     MTdata d;
 
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( numLines * sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", numLines, __FILE__, __LINE__);
+    lines = (const char **)malloc(numLines * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            numLines, __FILE__, __LINE__);
         return -1;
     }
 
     /* First and last lines are easy */
-    lines[ 0 ] = sample_kernel_start;
-    lines[ numLines - 1 ] = sample_kernel_end;
+    lines[0] = sample_kernel_start;
+    lines[numLines - 1] = sample_kernel_end;
 
-    numChoices = sizeof( sample_kernel_lines ) / sizeof( sample_kernel_lines[ 0 ] );
+    numChoices = sizeof(sample_kernel_lines) / sizeof(sample_kernel_lines[0]);
 
     /* Fill the rest with random lines to hopefully prevent much optimization */
-    d = init_genrand( gRandomSeed );
-    for( i = 1; i < numLines - 1; i++ )
+    d = init_genrand(gRandomSeed);
+    for (i = 1; i < numLines - 1; i++)
     {
-        lines[ i ] = sample_kernel_lines[ genrand_int32(d) % numChoices ];
+        lines[i] = sample_kernel_lines[genrand_int32(d) % numChoices];
     }
-    free_mtdata(d);     d = NULL;
+    free_mtdata(d);
+    d = NULL;
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &program, numLines, lines);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program,
+                                                       numLines, lines);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
-        free( lines );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s in %s:%d)",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
+        free(lines);
         if (program != NULL)
         {
-           error = clReleaseProgram( program );
-           test_error( error, "Unable to release a program object" );
+            error = clReleaseProgram(program);
+            test_error(error, "Unable to release a program object");
         }
         return -1;
     }
 
     /* Build it */
-    error = clBuildProgram( program, 1, &deviceID, NULL, NULL, NULL );
-    test_error( error, "Unable to build a long program" );
+    error = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
+    test_error(error, "Unable to build a long program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release a program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release a program object");
 
-    free( lines );
+    free(lines);
 
     return 0;
 }
 
-int test_large_compile(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_large_compile(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements)
 {
-    unsigned int toTest[] = { 64, 128, 256, 512, 1024, 2048, 4096, 0 }; //8192, 16384, 32768, 0 };
+    unsigned int toTest[] = {
+        64, 128, 256, 512, 1024, 2048, 4096, 0
+    }; // 8192, 16384, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing large compiles...this might take awhile...\n" );
+    log_info("Testing large compiles...this might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_single_compile( context, deviceID, toTest[ i ] ) != 0 )
+        if (test_large_single_compile(context, deviceID, toTest[i]) != 0)
         {
-            log_error( "ERROR: long program test failed for %d lines! (in %s:%d)\n", toTest[ i ], __FILE__, __LINE__);
+            log_error(
+                "ERROR: long program test failed for %d lines! (in %s:%d)\n",
+                toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-static int verifyCopyBuffer(cl_context context, cl_command_queue queue, cl_kernel kernel);
+static int verifyCopyBuffer(cl_context context, cl_command_queue queue,
+                            cl_kernel kernel);
 
 #if defined(__APPLE__) || defined(__linux)
 #define _strdup strdup
 #endif
 
-int test_large_multi_file_library(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_multi_file_library(cl_context context, cl_device_id deviceID,
+                                  cl_command_queue queue, unsigned int numLines)
 {
     int error;
     cl_program program;
@@ -264,164 +280,194 @@
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
 
-    simple_kernels = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (simple_kernels == NULL) {
-        log_error( "ERROR: Unable to allocate kernels array with %d kernels! (in %s:%d)\n", numLines, __FILE__, __LINE__);
+    simple_kernels = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (simple_kernels == NULL)
+    {
+        log_error("ERROR: Unable to allocate kernels array with %d kernels! "
+                  "(in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( (2*numLines + 2) * sizeof( const char * ) );
-    if (lines == NULL) {
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
         free(simple_kernels);
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, composite_kernel_extern_template, i);
         lines[i] = _strdup(buffer);
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2* numLines + 1] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for( i = numLines + 1; i < 2* numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &program, 2 * numLines + 2, lines);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program,
+                                                       2 * numLines + 2, lines);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s) (in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__  );
-        free( simple_kernels );
-        for( i = 0; i < numLines; i++)
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s) (in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
+        free(simple_kernels);
+        for (i = 0; i < numLines; i++)
         {
-            free( (void*)lines[i] );
-            free( (void*)lines[i+numLines+1] );
+            free((void *)lines[i]);
+            free((void *)lines[i + numLines + 1]);
         }
-        free( lines );
+        free(lines);
         if (program != NULL)
         {
-           error = clReleaseProgram( program );
-           test_error( error, "Unable to release program object" );
+            error = clReleaseProgram(program);
+            test_error(error, "Unable to release program object");
         }
 
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* Create and compile templated kernels */
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        simple_kernels[i] = clCreateProgramWithSource( context, 1, &kernel_source, NULL, &error );
-        if( simple_kernels[i] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        simple_kernels[i] =
+            clCreateProgramWithSource(context, 1, &kernel_source, NULL, &error);
+        if (simple_kernels[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s) (in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__  );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s) (in %s:%d)\n",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL,
+                                 NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
     /* Create library out of compiled templated kernels */
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", numLines, simple_kernels, NULL, NULL, &error);
-    test_error( error, "Unable to create a multi-line library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", numLines,
+                      simple_kernels, NULL, NULL, &error);
+    test_error(error, "Unable to create a multi-line library");
 
-    /* Link the program that calls the kernels and the library that contains them */
+    /* Link the program that calls the kernels and the library that contains
+     * them */
     cl_program programs[2] = { program, my_newly_minted_library };
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, programs, NULL, NULL, &error);
-    test_error( error, "Unable to link a program with a library" );
+    cl_program my_newly_linked_program = clLinkProgram(
+        context, 1, &deviceID, NULL, 2, programs, NULL, NULL, &error);
+    test_error(error, "Unable to link a program with a library");
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
-        free( (void*)lines[i+numLines+1] );
+        free((void *)lines[i]);
+        free((void *)lines[i + numLines + 1]);
     }
-    free( lines );
+    free(lines);
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        error = clReleaseProgram( simple_kernels[i] );
-        test_error( error, "Unable to release program object" );
+        error = clReleaseProgram(simple_kernels[i]);
+        test_error(error, "Unable to release program object");
     }
-    free( simple_kernels );
+    free(simple_kernels);
 
-  error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_multi_file_libraries(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_multi_file_libraries(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
 {
-    unsigned int toTest[] = { 2, 4, 8, 16, 32, 64, 128, 256, 0 }; // 512, 1024, 2048, 4096, 8192, 16384, 32768, 0 };
+    unsigned int toTest[] = {
+        2, 4, 8, 16, 32, 64, 128, 256, 0
+    }; // 512, 1024, 2048, 4096, 8192, 16384, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing multi-file libraries ...this might take awhile...\n" );
+    log_info("Testing multi-file libraries ...this might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_multi_file_library( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_multi_file_library(context, deviceID, queue, toTest[i])
+            != 0)
         {
-            log_error( "ERROR: multi-file library program test failed for %d lines! (in %s:%d)\n\n", toTest[ i ], __FILE__, __LINE__  );
+            log_error("ERROR: multi-file library program test failed for %d "
+                      "lines! (in %s:%d)\n\n",
+                      toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-int test_large_multiple_embedded_headers(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_multiple_embedded_headers(cl_context context,
+                                         cl_device_id deviceID,
+                                         cl_command_queue queue,
+                                         unsigned int numLines)
 {
     int error;
     cl_program program;
@@ -432,29 +478,41 @@
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
 
-    simple_kernels = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (simple_kernels == NULL) {
-        log_error( "ERROR: Unable to allocate simple_kernels array with %d lines! (in %s:%d)\n", numLines, __FILE__, __LINE__ );
+    simple_kernels = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (simple_kernels == NULL)
+    {
+        log_error("ERROR: Unable to allocate simple_kernels array with %d "
+                  "lines! (in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
-    headers = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (headers == NULL) {
-        log_error( "ERROR: Unable to allocate headers array with %d lines! (in %s:%d)\n", numLines, __FILE__, __LINE__ );
+    headers = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (headers == NULL)
+    {
+        log_error("ERROR: Unable to allocate headers array with %d lines! (in "
+                  "%s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    header_names = (const char**)malloc( numLines*sizeof( const char * ) );
-    if (header_names == NULL) {
-        log_error( "ERROR: Unable to allocate header_names array with %d lines! (in %s:%d)\n", numLines, __FILE__, __LINE__ );
+    header_names = (const char **)malloc(numLines * sizeof(const char *));
+    if (header_names == NULL)
+    {
+        log_error("ERROR: Unable to allocate header_names array with %d lines! "
+                  "(in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
-    lines = (const char **)malloc( (2*numLines + 2)*sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__ );
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, include_header_name_templates[i % 4], i);
         lines[i] = _strdup(buffer);
@@ -462,154 +520,178 @@
         header_names[i] = _strdup(buffer);
 
         sprintf(buffer, composite_kernel_extern_template, i);
-        const char* line = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &headers[i], 1, &line);
-        if( headers[i] == NULL || error != CL_SUCCESS )
+        const char *line = buffer;
+        error = create_single_kernel_helper_create_program(context, &headers[i],
+                                                           1, &line);
+        if (headers[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create a simple header program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__);
+            log_error("ERROR: Unable to create a simple header program! (%s in "
+                      "%s:%d)\n",
+                      IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2* numLines + 1 ] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for( i = numLines + 1; i < 2* numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &program, 2 * numLines + 2, lines);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program,
+                                                       2 * numLines + 2, lines);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s) (in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s) (in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(program, 1, &deviceID, NULL, numLines, headers, header_names, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, numLines, headers,
+                             header_names, NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* Create and compile templated kernels */
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &simple_kernels[i], 1, &kernel_source);
-        if( simple_kernels[i] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        error = create_single_kernel_helper_create_program(
+            context, &simple_kernels[i], 1, &kernel_source);
+        if (simple_kernels[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s) (in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s) (in %s:%d)\n",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL,
+                                 NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
     /* Create library out of compiled templated kernels */
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", numLines, simple_kernels, NULL, NULL, &error);
-    test_error( error, "Unable to create a multi-line library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", numLines,
+                      simple_kernels, NULL, NULL, &error);
+    test_error(error, "Unable to create a multi-line library");
 
-    /* Link the program that calls the kernels and the library that contains them */
+    /* Link the program that calls the kernels and the library that contains
+     * them */
     cl_program programs[2] = { program, my_newly_minted_library };
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, programs, NULL, NULL, &error);
-    test_error( error, "Unable to link a program with a library" );
+    cl_program my_newly_linked_program = clLinkProgram(
+        context, 1, &deviceID, NULL, 2, programs, NULL, NULL, &error);
+    test_error(error, "Unable to link a program with a library");
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
-        free( (void*)header_names[i] );
+        free((void *)lines[i]);
+        free((void *)header_names[i]);
     }
-    for( i = numLines + 1; i < 2* numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
-        free( (void*)lines[i] );
+        free((void *)lines[i]);
     }
-    free( lines );
-    free( header_names );
+    free(lines);
+    free(header_names);
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        error = clReleaseProgram( simple_kernels[i] );
-        test_error( error, "Unable to release program object" );
-        error = clReleaseProgram( headers[i] );
-        test_error( error, "Unable to release header program object" );
+        error = clReleaseProgram(simple_kernels[i]);
+        test_error(error, "Unable to release program object");
+        error = clReleaseProgram(headers[i]);
+        test_error(error, "Unable to release header program object");
     }
-    free( simple_kernels );
-    free( headers );
+    free(simple_kernels);
+    free(headers);
 
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_multiple_embedded_headers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_multiple_embedded_headers(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
 {
-    unsigned int toTest[] = { 2, 4, 8, 16, 32, 64, 128, 256, 0 }; // 512, 1024, 2048, 4096, 8192, 16384, 32768, 0 };
+    unsigned int toTest[] = {
+        2, 4, 8, 16, 32, 64, 128, 256, 0
+    }; // 512, 1024, 2048, 4096, 8192, 16384, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing multiple embedded headers ...this might take awhile...\n" );
+    log_info(
+        "Testing multiple embedded headers ...this might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_multiple_embedded_headers( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_multiple_embedded_headers(context, deviceID, queue,
+                                                 toTest[i])
+            != 0)
         {
-            log_error( "ERROR: multiple embedded headers program test failed for %d lines! (in %s:%d)\n", toTest[ i ], __FILE__, __LINE__ );
+            log_error("ERROR: multiple embedded headers program test failed "
+                      "for %d lines! (in %s:%d)\n",
+                      toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-double logbase(double a, double base)
-{
-   return log(a) / log(base);
-}
+double logbase(double a, double base) { return log(a) / log(base); }
 
-int test_large_multiple_libraries(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_multiple_libraries(cl_context context, cl_device_id deviceID,
+                                  cl_command_queue queue, unsigned int numLines)
 {
     int error;
     cl_program *simple_kernels;
@@ -617,164 +699,202 @@
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
     /* I want to create (log2(N)+1)/2 libraries */
-    unsigned int level = (unsigned int)(logbase(numLines, 2.0) + 1.000001)/2;
+    unsigned int level = (unsigned int)(logbase(numLines, 2.0) + 1.000001) / 2;
     unsigned int numLibraries = (unsigned int)pow(2.0, level - 1.0);
-    unsigned int numFilesInLib = numLines/numLibraries;
-    cl_program *my_program_and_libraries = (cl_program*)malloc((1+numLibraries)*sizeof(cl_program));
-    if (my_program_and_libraries == NULL) {
-        log_error( "ERROR: Unable to allocate program array with %d programs! (in %s:%d)\n", (1+numLibraries), __FILE__, __LINE__);
+    unsigned int numFilesInLib = numLines / numLibraries;
+    cl_program *my_program_and_libraries =
+        (cl_program *)malloc((1 + numLibraries) * sizeof(cl_program));
+    if (my_program_and_libraries == NULL)
+    {
+        log_error("ERROR: Unable to allocate program array with %d programs! "
+                  "(in %s:%d)\n",
+                  (1 + numLibraries), __FILE__, __LINE__);
         return -1;
     }
 
-    log_info("level - %d, numLibraries - %d, numFilesInLib - %d\n", level, numLibraries, numFilesInLib);
+    log_info("level - %d, numLibraries - %d, numFilesInLib - %d\n", level,
+             numLibraries, numFilesInLib);
 
-    simple_kernels = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (simple_kernels == NULL) {
-        log_error( "ERROR: Unable to allocate kernels array with %d kernels! (in %s:%d)\n", numLines, __FILE__, __LINE__);
+    simple_kernels = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (simple_kernels == NULL)
+    {
+        log_error("ERROR: Unable to allocate kernels array with %d kernels! "
+                  "(in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( (2*numLines + 2) * sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__);
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, composite_kernel_extern_template, i);
         lines[i] = _strdup(buffer);
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2*numLines + 1] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &my_program_and_libraries[0], 2 * numLines + 2, lines);
-    if( my_program_and_libraries[0] == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &my_program_and_libraries[0], 2 * numLines + 2, lines);
+    if (my_program_and_libraries[0] == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(my_program_and_libraries[0], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(my_program_and_libraries[0], 1, &deviceID, NULL, 0,
+                             NULL, NULL, NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* Create and compile templated kernels */
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &simple_kernels[i], 1, &kernel_source);
-        if( simple_kernels[i] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        error = create_single_kernel_helper_create_program(
+            context, &simple_kernels[i], 1, &kernel_source);
+        if (simple_kernels[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s in %s:%d)\n",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL,
+                                 NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
     /* Create library out of compiled templated kernels */
-    for(i = 0; i < numLibraries; i++) {
-        my_program_and_libraries[i+1] = clLinkProgram(context, 1, &deviceID, "-create-library", numFilesInLib, simple_kernels+i*numFilesInLib, NULL, NULL, &error);
-        test_error( error, "Unable to create a multi-line library" );
+    for (i = 0; i < numLibraries; i++)
+    {
+        my_program_and_libraries[i + 1] = clLinkProgram(
+            context, 1, &deviceID, "-create-library", numFilesInLib,
+            simple_kernels + i * numFilesInLib, NULL, NULL, &error);
+        test_error(error, "Unable to create a multi-line library");
     }
 
-    /* Link the program that calls the kernels and the library that contains them */
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, numLibraries+1, my_program_and_libraries, NULL, NULL, &error);
-    test_error( error, "Unable to link a program with a library" );
+    /* Link the program that calls the kernels and the library that contains
+     * them */
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL, numLibraries + 1,
+                      my_program_and_libraries, NULL, NULL, &error);
+    test_error(error, "Unable to link a program with a library");
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    for(i = 0; i <= numLibraries; i++) {
-        error = clReleaseProgram( my_program_and_libraries[i] );
-        test_error( error, "Unable to release program object" );
-    }
-    free( my_program_and_libraries );
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i <= numLibraries; i++)
     {
-        free( (void*)lines[i] );
+        error = clReleaseProgram(my_program_and_libraries[i]);
+        test_error(error, "Unable to release program object");
     }
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    free(my_program_and_libraries);
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
+        free((void *)lines[i]);
     }
-    free( lines );
-
-    for(i = 0; i < numLines; i++)
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
-        error = clReleaseProgram( simple_kernels[i] );
-        test_error( error, "Unable to release program object" );
+        free((void *)lines[i]);
     }
-    free( simple_kernels );
+    free(lines);
 
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    for (i = 0; i < numLines; i++)
+    {
+        error = clReleaseProgram(simple_kernels[i]);
+        test_error(error, "Unable to release program object");
+    }
+    free(simple_kernels);
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
+
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_multiple_libraries(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_multiple_libraries(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
 {
-    unsigned int toTest[] = { 2, 8, 32, 128, 256, 0 }; // 512, 2048, 8192, 32768, 0 };
+    unsigned int toTest[] = {
+        2, 8, 32, 128, 256, 0
+    }; // 512, 2048, 8192, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing multiple libraries ...this might take awhile...\n" );
+    log_info("Testing multiple libraries ...this might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_multiple_libraries( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_multiple_libraries(context, deviceID, queue, toTest[i])
+            != 0)
         {
-            log_error( "ERROR: multiple library program test failed for %d lines! (in %s:%d)\n\n", toTest[ i ], __FILE__, __LINE__ );
+            log_error("ERROR: multiple library program test failed for %d "
+                      "lines! (in %s:%d)\n\n",
+                      toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-int test_large_multiple_files_multiple_libraries(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_multiple_files_multiple_libraries(cl_context context,
+                                                 cl_device_id deviceID,
+                                                 cl_command_queue queue,
+                                                 unsigned int numLines)
 {
     int error;
     cl_program *simple_kernels;
@@ -782,915 +902,1173 @@
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
     /* I want to create (log2(N)+1)/4 libraries */
-    unsigned int level = (unsigned int)(logbase(numLines, 2.0) + 1.000001)/2;
+    unsigned int level = (unsigned int)(logbase(numLines, 2.0) + 1.000001) / 2;
     unsigned int numLibraries = (unsigned int)pow(2.0, level - 2.0);
-    unsigned int numFilesInLib = numLines/(2*numLibraries);
-    cl_program *my_programs_and_libraries = (cl_program*)malloc((1+numLibraries+numLibraries*numFilesInLib)*sizeof(cl_program));
-    if (my_programs_and_libraries == NULL) {
-        log_error( "ERROR: Unable to allocate program array with %d programs! (in %s:%d)\n", (1+numLibraries+numLibraries*numFilesInLib), __FILE__, __LINE__ );
+    unsigned int numFilesInLib = numLines / (2 * numLibraries);
+    cl_program *my_programs_and_libraries = (cl_program *)malloc(
+        (1 + numLibraries + numLibraries * numFilesInLib) * sizeof(cl_program));
+    if (my_programs_and_libraries == NULL)
+    {
+        log_error("ERROR: Unable to allocate program array with %d programs! "
+                  "(in %s:%d)\n",
+                  (1 + numLibraries + numLibraries * numFilesInLib), __FILE__,
+                  __LINE__);
         return -1;
     }
-    log_info("level - %d, numLibraries - %d, numFilesInLib - %d\n", level, numLibraries, numFilesInLib);
+    log_info("level - %d, numLibraries - %d, numFilesInLib - %d\n", level,
+             numLibraries, numFilesInLib);
 
-    simple_kernels = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (simple_kernels == NULL) {
-        log_error( "ERROR: Unable to allocate kernels array with %d kernels! (in %s:%d)\n", numLines, __FILE__, __LINE__ );
+    simple_kernels = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (simple_kernels == NULL)
+    {
+        log_error("ERROR: Unable to allocate kernels array with %d kernels! "
+                  "(in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( (2*numLines + 2) * sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__ );
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, composite_kernel_extern_template, i);
         lines[i] = _strdup(buffer);
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2*numLines + 1] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &my_programs_and_libraries[0], 2 * numLines + 2, lines);
-    if( my_programs_and_libraries[0] == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &my_programs_and_libraries[0], 2 * numLines + 2, lines);
+    if (my_programs_and_libraries[0] == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(my_programs_and_libraries[0], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(my_programs_and_libraries[0], 1, &deviceID, NULL,
+                             0, NULL, NULL, NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* Create and compile templated kernels */
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &simple_kernels[i], 1, &kernel_source);
-        if( simple_kernels[i] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        error = create_single_kernel_helper_create_program(
+            context, &simple_kernels[i], 1, &kernel_source);
+        if (simple_kernels[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s in %s:%d)\n",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL,
+                                 NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
     /* Copy already compiled kernels */
-    for( i = 0; i < numLibraries*numFilesInLib; i++) {
-        my_programs_and_libraries[i+1] = simple_kernels[i];
+    for (i = 0; i < numLibraries * numFilesInLib; i++)
+    {
+        my_programs_and_libraries[i + 1] = simple_kernels[i];
     }
 
     /* Create library out of compiled templated kernels */
-    for( i = 0; i < numLibraries; i++) {
-        my_programs_and_libraries[i+1+numLibraries*numFilesInLib] = clLinkProgram(context, 1, &deviceID, "-create-library", numFilesInLib, simple_kernels+(i*numFilesInLib+numLibraries*numFilesInLib), NULL, NULL, &error);
-        test_error( error, "Unable to create a multi-line library" );
+    for (i = 0; i < numLibraries; i++)
+    {
+        my_programs_and_libraries[i + 1 + numLibraries * numFilesInLib] =
+            clLinkProgram(
+                context, 1, &deviceID, "-create-library", numFilesInLib,
+                simple_kernels
+                    + (i * numFilesInLib + numLibraries * numFilesInLib),
+                NULL, NULL, &error);
+        test_error(error, "Unable to create a multi-line library");
     }
 
-    /* Link the program that calls the kernels and the library that contains them */
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, numLibraries+1+numLibraries*numFilesInLib, my_programs_and_libraries, NULL, NULL, &error);
-    test_error( error, "Unable to link a program with a library" );
+    /* Link the program that calls the kernels and the library that contains
+     * them */
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL,
+                      numLibraries + 1 + numLibraries * numFilesInLib,
+                      my_programs_and_libraries, NULL, NULL, &error);
+    test_error(error, "Unable to link a program with a library");
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    for(i = 0; i < numLibraries+1+numLibraries*numFilesInLib; i++) {
-        error = clReleaseProgram( my_programs_and_libraries[i] );
-        test_error( error, "Unable to release program object" );
-    }
-    free( my_programs_and_libraries );
-
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLibraries + 1 + numLibraries * numFilesInLib; i++)
     {
-        free( (void*)lines[i] );
+        error = clReleaseProgram(my_programs_and_libraries[i]);
+        test_error(error, "Unable to release program object");
     }
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    free(my_programs_and_libraries);
+
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
+        free((void *)lines[i]);
     }
-    free( lines );
-
-    for(i = numLibraries*numFilesInLib; i < numLines; i++)
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
-        error = clReleaseProgram( simple_kernels[i] );
-        test_error( error, "Unable to release program object" );
+        free((void *)lines[i]);
     }
-    free( simple_kernels );
+    free(lines);
 
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    for (i = numLibraries * numFilesInLib; i < numLines; i++)
+    {
+        error = clReleaseProgram(simple_kernels[i]);
+        test_error(error, "Unable to release program object");
+    }
+    free(simple_kernels);
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
+
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_multiple_files_multiple_libraries(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_multiple_files_multiple_libraries(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements)
 {
-    unsigned int toTest[] = { 8, 32, 128, 256, 0 }; // 512, 2048, 8192, 32768, 0 };
+    unsigned int toTest[] = { 8, 32, 128, 256,
+                              0 }; // 512, 2048, 8192, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing multiple files and multiple libraries ...this might take awhile...\n" );
+    log_info("Testing multiple files and multiple libraries ...this might take "
+             "awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_multiple_files_multiple_libraries( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_multiple_files_multiple_libraries(context, deviceID,
+                                                         queue, toTest[i])
+            != 0)
         {
-            log_error( "ERROR: multiple files, multiple libraries program test failed for %d lines! (in %s:%d)\n\n", toTest[ i ], __FILE__, __LINE__ );
+            log_error("ERROR: multiple files, multiple libraries program test "
+                      "failed for %d lines! (in %s:%d)\n\n",
+                      toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-int test_large_multiple_files(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_multiple_files(cl_context context, cl_device_id deviceID,
+                              cl_command_queue queue, unsigned int numLines)
 {
     int error;
     const char **lines;
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
-    cl_program *my_programs = (cl_program*)malloc((1+numLines)*sizeof(cl_program));
+    cl_program *my_programs =
+        (cl_program *)malloc((1 + numLines) * sizeof(cl_program));
 
-    if (my_programs == NULL) {
-        log_error( "ERROR: Unable to allocate my_programs array with %d programs! (in %s:%d)\n", (1+numLines), __FILE__, __LINE__);
+    if (my_programs == NULL)
+    {
+        log_error("ERROR: Unable to allocate my_programs array with %d "
+                  "programs! (in %s:%d)\n",
+                  (1 + numLines), __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( (2*numLines + 2) * sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__);
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, composite_kernel_extern_template, i);
         lines[i] = _strdup(buffer);
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2* numLines + 1] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &my_programs[0], 2 * numLines + 2, lines);
-    if( my_programs[0] == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &my_programs[0],
+                                                       2 * numLines + 2, lines);
+    if (my_programs[0] == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(my_programs[0], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(my_programs[0], 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* Create and compile templated kernels */
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &my_programs[i + 1], 1, &kernel_source);
-        if( my_programs[i+1] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        error = create_single_kernel_helper_create_program(
+            context, &my_programs[i + 1], 1, &kernel_source);
+        if (my_programs[i + 1] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s in %s:%d)\n",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(my_programs[i+1], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(my_programs[i + 1], 1, &deviceID, NULL, 0,
+                                 NULL, NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
-    /* Link the program that calls the kernels and the library that contains them */
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1+numLines, my_programs, NULL, NULL, &error);
-    test_error( error, "Unable to link a program with a library" );
+    /* Link the program that calls the kernels and the library that contains
+     * them */
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL, 1 + numLines, my_programs,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to link a program with a library");
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    for(i = 0; i < 1+numLines; i++) {
-        error = clReleaseProgram( my_programs[i] );
-        test_error( error, "Unable to release program object" );
-    }
-    free( my_programs );
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < 1 + numLines; i++)
     {
-        free( (void*)lines[i] );
+        error = clReleaseProgram(my_programs[i]);
+        test_error(error, "Unable to release program object");
     }
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    free(my_programs);
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
+        free((void *)lines[i]);
     }
-    free( lines );
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
+    {
+        free((void *)lines[i]);
+    }
+    free(lines);
 
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_multiple_files(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_multiple_files(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
 {
-    unsigned int toTest[] = { 8, 32, 128, 256, 0 }; // 512, 2048, 8192, 32768, 0 };
+    unsigned int toTest[] = { 8, 32, 128, 256,
+                              0 }; // 512, 2048, 8192, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing multiple files compilation and linking into a single executable ...this might take awhile...\n" );
+    log_info("Testing multiple files compilation and linking into a single "
+             "executable ...this might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_multiple_files( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_multiple_files(context, deviceID, queue, toTest[i]) != 0)
         {
-            log_error( "ERROR: multiple files program test failed for %d lines! (in %s:%d)\n\n", toTest[ i ], __FILE__, __LINE__ );
+            log_error("ERROR: multiple files program test failed for %d lines! "
+                      "(in %s:%d)\n\n",
+                      toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
     return 0;
 }
 
-int test_simple_compile_only(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_compile_only(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing a simple compilation only...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_static_compile_only(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_static_compile_only(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing a simple static compilations only...\n");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_static_var);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &compile_static_var);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple static variable test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple static variable test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling a static variable...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple static variable program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple static variable program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_static_struct);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &compile_static_struct);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple static struct test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple static struct test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling a static struct...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple static variable program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple static struct program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_static_function);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &program, 1, &compile_static_function);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple static function test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple static function test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling a static function...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple static function program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple static function program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_extern_compile_only(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_extern_compile_only(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing a simple extern compilations only...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_header);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_header);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple extern kernel test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple extern kernel test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling an extern kernel...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple extern kernel program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple extern kernel program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_extern_var);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &compile_extern_var);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple extern variable test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple extern variable test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling an extern variable...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple extern variable program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple extern variable program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_extern_struct);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &compile_extern_struct);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple extern struct test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple extern struct test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling an extern struct...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple extern variable program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple extern struct program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = create_single_kernel_helper_create_program(context, &program, 1, &compile_extern_function);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &program, 1, &compile_extern_function);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple extern function test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a simple extern function test "
+                  "program! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     log_info("Compiling an extern function...\n");
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple extern function program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple extern function program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-struct simple_user_data {
-    const char*        m_message;
-    cl_event        m_event;
+struct simple_user_data
+{
+    const char *m_message;
+    cl_event m_event;
 };
 
-const char* once_upon_a_midnight_dreary = "Once upon a midnight dreary!";
+const char *once_upon_a_midnight_dreary = "Once upon a midnight dreary!";
 
-static void CL_CALLBACK simple_compile_callback(cl_program program, void* user_data)
+static void CL_CALLBACK simple_compile_callback(cl_program program,
+                                                void *user_data)
 {
-    simple_user_data* simple_compile_user_data = (simple_user_data*)user_data;
-    log_info("in the simple_compile_callback: program %p just completed compiling with '%s'\n", program, simple_compile_user_data->m_message);
-    if (strcmp(once_upon_a_midnight_dreary, simple_compile_user_data->m_message) != 0)
+    simple_user_data *simple_compile_user_data = (simple_user_data *)user_data;
+    log_info("in the simple_compile_callback: program %p just completed "
+             "compiling with '%s'\n",
+             program, simple_compile_user_data->m_message);
+    if (strcmp(once_upon_a_midnight_dreary, simple_compile_user_data->m_message)
+        != 0)
     {
-        log_error("ERROR: in the simple_compile_callback: Expected '%s' and got %s (in %s:%d)!\n", once_upon_a_midnight_dreary, simple_compile_user_data->m_message, __FILE__, __LINE__);
+        log_error("ERROR: in the simple_compile_callback: Expected '%s' and "
+                  "got %s (in %s:%d)!\n",
+                  once_upon_a_midnight_dreary,
+                  simple_compile_user_data->m_message, __FILE__, __LINE__);
     }
 
     int error;
-    log_info("in the simple_compile_callback: program %p just completed compiling with '%p'\n", program, simple_compile_user_data->m_event);
+    log_info("in the simple_compile_callback: program %p just completed "
+             "compiling with '%p'\n",
+             program, simple_compile_user_data->m_event);
 
-    error = clSetUserEventStatus(simple_compile_user_data->m_event, CL_COMPLETE);
+    error =
+        clSetUserEventStatus(simple_compile_user_data->m_event, CL_COMPLETE);
     if (error != CL_SUCCESS)
     {
-        log_error( "ERROR: in the simple_compile_callback: Unable to set user event status to CL_COMPLETE! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: in the simple_compile_callback: Unable to set user "
+                  "event status to CL_COMPLETE! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         exit(-1);
     }
-    log_info("in the simple_compile_callback: Successfully signaled compile_program_completion_event!\n");
+    log_info("in the simple_compile_callback: Successfully signaled "
+             "compile_program_completion_event!\n");
 }
 
-int test_simple_compile_with_callback(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_compile_with_callback(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
     cl_event compile_program_completion_event;
 
     log_info("Testing a simple compilation with callback...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     compile_program_completion_event = clCreateUserEvent(context, &error);
-    test_error( error, "Unable to create a user event");
+    test_error(error, "Unable to create a user event");
 
-    simple_user_data simple_compile_user_data = {once_upon_a_midnight_dreary, compile_program_completion_event};
+    simple_user_data simple_compile_user_data = {
+        once_upon_a_midnight_dreary, compile_program_completion_event
+    };
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, simple_compile_callback, (void*)&simple_compile_user_data);
-    test_error( error, "Unable to compile a simple program with a callback" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             simple_compile_callback,
+                             (void *)&simple_compile_user_data);
+    test_error(error, "Unable to compile a simple program with a callback");
 
     error = clWaitForEvents(1, &compile_program_completion_event);
-    test_error( error, "clWaitForEvents failed when waiting on compile_program_completion_event");
+    test_error(error,
+               "clWaitForEvents failed when waiting on "
+               "compile_program_completion_event");
 
     /* All done! */
     error = clReleaseEvent(compile_program_completion_event);
-    test_error( error, "Unable to release event object" );
+    test_error(error, "Unable to release event object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_embedded_header_compile(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_embedded_header_compile(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
 {
     int error;
     cl_program program, header;
 
     log_info("Testing a simple embedded header compile only...\n");
-    program = clCreateProgramWithSource(context, 1, &another_simple_kernel_with_header, NULL, &error);
-    if( program == NULL || error != CL_SUCCESS )
+    program = clCreateProgramWithSource(
+        context, 1, &another_simple_kernel_with_header, NULL, &error);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    header = clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
-    if( header == NULL || error != CL_SUCCESS )
+    header =
+        clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
+    if (header == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple header program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple header program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header, &simple_header_name, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with embedded header" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header,
+                             &simple_header_name, NULL, NULL);
+    test_error(error,
+               "Unable to compile a simple program with embedded header");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( header );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(header);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_link_only(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_link_only(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing a simple linking only...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program = clLinkProgram(
+        context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
+    test_error(error, "Unable to link a simple program");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_two_file_regular_variable_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_two_file_regular_variable_access(cl_device_id deviceID,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements)
 {
     int error;
     cl_program program, second_program, my_newly_linked_program;
 
-    const char* sources[2] = {simple_kernel, compile_regular_var}; // here we want to avoid linking error due to lack of kernels
-    log_info("Compiling and linking two program objects, where one tries to access regular variable from another...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 2, sources);
-    if( program == NULL || error != CL_SUCCESS )
+    const char *sources[2] = {
+        simple_kernel, compile_regular_var
+    }; // here we want to avoid linking error due to lack of kernels
+    log_info("Compiling and linking two program objects, where one tries to "
+             "access regular variable from another...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 2,
+                                                       sources);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program with regular variable! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program with regular "
+                  "variable! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with regular function" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error,
+               "Unable to compile a simple program with regular function");
 
-    error = create_single_kernel_helper_create_program(context, &second_program, 1, &link_static_var_access);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &second_program, 1, &link_static_var_access);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program that tries to access a regular variable! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program that tries to access "
+                  "a regular variable! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a program that tries to access a regular variable" );
+    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(
+        error,
+        "Unable to compile a program that tries to access a regular variable");
 
     cl_program two_programs[2] = { program, second_program };
-    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, two_programs, NULL, NULL, &error);
-    test_error( error, "clLinkProgram: Expected a different error code while linking a program that tries to access a regular variable" );
+    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2,
+                                            two_programs, NULL, NULL, &error);
+    test_error(error,
+               "clLinkProgram: Expected a different error code while linking a "
+               "program that tries to access a regular variable");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( second_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(second_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_two_file_regular_struct_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_two_file_regular_struct_access(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
 {
     int error;
     cl_program program, second_program, my_newly_linked_program;
 
-    const char* sources[2] = {simple_kernel, compile_regular_struct}; // here we want to avoid linking error due to lack of kernels
-    log_info("Compiling and linking two program objects, where one tries to access regular struct from another...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 2, sources);
-    if( program == NULL || error != CL_SUCCESS )
+    const char *sources[2] = {
+        simple_kernel, compile_regular_struct
+    }; // here we want to avoid linking error due to lack of kernels
+    log_info("Compiling and linking two program objects, where one tries to "
+             "access regular struct from another...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 2,
+                                                       sources);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program with regular struct! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program with regular struct! "
+                  "(%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with regular struct" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program with regular struct");
 
-    error = create_single_kernel_helper_create_program(context, &second_program, 1, &link_static_struct_access);
-    if( second_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &second_program, 1, &link_static_struct_access);
+    if (second_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program that tries to access a regular struct! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program that tries to access "
+                  "a regular struct! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a program that tries to access a regular struct" );
+    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(
+        error,
+        "Unable to compile a program that tries to access a regular struct");
 
     cl_program two_programs[2] = { program, second_program };
-    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, two_programs, NULL, NULL, &error);
-    test_error( error, "clLinkProgram: Expected a different error code while linking a program that tries to access a regular struct" );
+    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2,
+                                            two_programs, NULL, NULL, &error);
+    test_error(error,
+               "clLinkProgram: Expected a different error code while linking a "
+               "program that tries to access a regular struct");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( second_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(second_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
 
-int test_two_file_regular_function_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_two_file_regular_function_access(cl_device_id deviceID,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements)
 {
     int error;
     cl_program program, second_program, my_newly_linked_program;
 
-    const char* sources[2] = {simple_kernel, compile_regular_function}; // here we want to avoid linking error due to lack of kernels
-    log_info("Compiling and linking two program objects, where one tries to access regular function from another...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 2, sources);
-    if( program == NULL || error != CL_SUCCESS )
+    const char *sources[2] = {
+        simple_kernel, compile_regular_function
+    }; // here we want to avoid linking error due to lack of kernels
+    log_info("Compiling and linking two program objects, where one tries to "
+             "access regular function from another...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 2,
+                                                       sources);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program with regular function! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program with regular "
+                  "function! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with regular function" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error,
+               "Unable to compile a simple program with regular function");
 
-    error = create_single_kernel_helper_create_program(context, &second_program, 1, &link_static_function_access);
-    if( second_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &second_program, 1, &link_static_function_access);
+    if (second_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a test program that tries to access a regular function! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create a test program that tries to access "
+                  "a regular function! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a program that tries to access a regular function" );
+    error = clCompileProgram(second_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(
+        error,
+        "Unable to compile a program that tries to access a regular function");
 
     cl_program two_programs[2] = { program, second_program };
-    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, two_programs, NULL, NULL, &error);
-    test_error( error, "clLinkProgram: Expected a different error code while linking a program that tries to access a regular function" );
+    my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2,
+                                            two_programs, NULL, NULL, &error);
+    test_error(error,
+               "clLinkProgram: Expected a different error code while linking a "
+               "program that tries to access a regular function");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( second_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(second_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_embedded_header_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_embedded_header_link(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program, header, simple_program;
 
     log_info("Testing a simple embedded header link...\n");
-    program = clCreateProgramWithSource(context, 1, &another_simple_kernel_with_header, NULL, &error);
-    if( program == NULL || error != CL_SUCCESS )
+    program = clCreateProgramWithSource(
+        context, 1, &another_simple_kernel_with_header, NULL, &error);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    header = clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
-    if( header == NULL || error != CL_SUCCESS )
+    header =
+        clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
+    if (header == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple header program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple header program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header, &simple_header_name, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with embedded header" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header,
+                             &simple_header_name, NULL, NULL);
+    test_error(error,
+               "Unable to compile a simple program with embedded header");
 
-    error = create_single_kernel_helper_create_program(context, &simple_program, 1, &simple_kernel);
-    if( simple_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &simple_program,
+                                                       1, &simple_kernel);
+    if (simple_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     cl_program two_programs[2] = { program, simple_program };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from two binaries, one compiled with embedded header" );
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from two binaries, one compiled "
+               "with embedded header");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( header );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(header);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( simple_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(simple_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-const char* when_i_pondered_weak_and_weary = "When I pondered weak and weary!";
+const char *when_i_pondered_weak_and_weary = "When I pondered weak and weary!";
 
-static void CL_CALLBACK simple_link_callback(cl_program program, void* user_data)
+static void CL_CALLBACK simple_link_callback(cl_program program,
+                                             void *user_data)
 {
-    simple_user_data* simple_link_user_data = (simple_user_data*)user_data;
-    log_info("in the simple_link_callback: program %p just completed linking with '%s'\n", program, (const char*)simple_link_user_data->m_message);
-    if (strcmp(when_i_pondered_weak_and_weary, simple_link_user_data->m_message) != 0)
+    simple_user_data *simple_link_user_data = (simple_user_data *)user_data;
+    log_info("in the simple_link_callback: program %p just completed linking "
+             "with '%s'\n",
+             program, (const char *)simple_link_user_data->m_message);
+    if (strcmp(when_i_pondered_weak_and_weary, simple_link_user_data->m_message)
+        != 0)
     {
-        log_error("ERROR: in the simple_compile_callback: Expected '%s' and got %s! (in %s:%d)\n", when_i_pondered_weak_and_weary, simple_link_user_data->m_message, __FILE__, __LINE__);
+        log_error("ERROR: in the simple_compile_callback: Expected '%s' and "
+                  "got %s! (in %s:%d)\n",
+                  when_i_pondered_weak_and_weary,
+                  simple_link_user_data->m_message, __FILE__, __LINE__);
     }
 
     int error;
-    log_info("in the simple_link_callback: program %p just completed linking with '%p'\n", program, simple_link_user_data->m_event);
+    log_info("in the simple_link_callback: program %p just completed linking "
+             "with '%p'\n",
+             program, simple_link_user_data->m_event);
 
     error = clSetUserEventStatus(simple_link_user_data->m_event, CL_COMPLETE);
     if (error != CL_SUCCESS)
     {
-        log_error( "ERROR: simple_link_callback: Unable to set user event status to CL_COMPLETE! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: simple_link_callback: Unable to set user event "
+                  "status to CL_COMPLETE! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         exit(-1);
     }
-    log_info("in the simple_link_callback: Successfully signaled link_program_completion_event event!\n");
+    log_info("in the simple_link_callback: Successfully signaled "
+             "link_program_completion_event event!\n");
 }
 
-int test_simple_link_with_callback(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_link_with_callback(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
     cl_event link_program_completion_event;
 
     log_info("Testing a simple linking with callback...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
     link_program_completion_event = clCreateUserEvent(context, &error);
-    test_error( error, "Unable to create a user event");
+    test_error(error, "Unable to create a user event");
 
-    simple_user_data simple_link_user_data = {when_i_pondered_weak_and_weary, link_program_completion_event};
+    simple_user_data simple_link_user_data = { when_i_pondered_weak_and_weary,
+                                               link_program_completion_event };
 
-    cl_program my_linked_library = clLinkProgram(context, 1, &deviceID, NULL, 1, &program, simple_link_callback, (void*)&simple_link_user_data, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_linked_library = clLinkProgram(
+        context, 1, &deviceID, NULL, 1, &program, simple_link_callback,
+        (void *)&simple_link_user_data, &error);
+    test_error(error, "Unable to link a simple program");
 
     error = clWaitForEvents(1, &link_program_completion_event);
-    test_error( error, "clWaitForEvents failed when waiting on link_program_completion_event");
+    test_error(
+        error,
+        "clWaitForEvents failed when waiting on link_program_completion_event");
 
     /* All done! */
     error = clReleaseEvent(link_program_completion_event);
-    test_error( error, "Unable to release event object" );
+    test_error(error, "Unable to release event object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_linked_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_linked_library);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-static void initBuffer(float* & srcBuffer, unsigned int cnDimension)
+static void initBuffer(float *&srcBuffer, unsigned int cnDimension)
 {
     float num = 0.0f;
 
-    for( unsigned int i = 0; i < cnDimension; i++ )
+    for (unsigned int i = 0; i < cnDimension; i++)
     {
-        if( ( i % 10 ) == 0  )
+        if ((i % 10) == 0)
         {
             num = 0.0f;
         }
 
-        srcBuffer[ i ] = num;
+        srcBuffer[i] = num;
         num = num + 1.0f;
     }
 }
 
-static int verifyCopyBuffer(cl_context context, cl_command_queue queue, cl_kernel kernel)
+static int verifyCopyBuffer(cl_context context, cl_command_queue queue,
+                            cl_kernel kernel)
 {
     int error, result = CL_SUCCESS;
     const size_t cnDimension = 32;
 
     // Allocate source buffer
-    float * srcBuffer = (float*)malloc(cnDimension * sizeof(float));
-    float * dstBuffer = (float*)malloc(cnDimension * sizeof(float));
+    float *srcBuffer = (float *)malloc(cnDimension * sizeof(float));
+    float *dstBuffer = (float *)malloc(cnDimension * sizeof(float));
 
-    if (srcBuffer == NULL) {
-        log_error( "ERROR: Unable to allocate srcBuffer float array with %lu floats! (in %s:%d)\n", cnDimension, __FILE__, __LINE__);
+    if (srcBuffer == NULL)
+    {
+        log_error("ERROR: Unable to allocate srcBuffer float array with %lu "
+                  "floats! (in %s:%d)\n",
+                  cnDimension, __FILE__, __LINE__);
         return -1;
     }
-    if (dstBuffer == NULL) {
-        log_error( "ERROR: Unable to allocate dstBuffer float array with %lu floats! (in %s:%d)\n", cnDimension, __FILE__, __LINE__);
+    if (dstBuffer == NULL)
+    {
+        log_error("ERROR: Unable to allocate dstBuffer float array with %lu "
+                  "floats! (in %s:%d)\n",
+                  cnDimension, __FILE__, __LINE__);
         return -1;
     }
 
-    if( srcBuffer && dstBuffer )
+    if (srcBuffer && dstBuffer)
     {
         // initialize host memory
-        initBuffer(srcBuffer, cnDimension );
+        initBuffer(srcBuffer, cnDimension);
 
         // Allocate device memory
-        cl_mem deviceMemSrc = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-                cnDimension * sizeof( cl_float ), srcBuffer, &error);
-        test_error( error, "Unable to create a source memory buffer" );
+        cl_mem deviceMemSrc =
+            clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                           cnDimension * sizeof(cl_float), srcBuffer, &error);
+        test_error(error, "Unable to create a source memory buffer");
 
-        cl_mem deviceMemDst = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
-                cnDimension * sizeof( cl_float ), 0, &error);
-        test_error( error, "Unable to create a destination memory buffer" );
+        cl_mem deviceMemDst =
+            clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                           cnDimension * sizeof(cl_float), 0, &error);
+        test_error(error, "Unable to create a destination memory buffer");
 
         // Set kernel args
         // Set parameter 0 to be the source buffer
-        error = clSetKernelArg(kernel, 0, sizeof( cl_mem ), ( void * )&deviceMemSrc );
-        test_error( error, "Unable to set the first kernel argument" );
+        error =
+            clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&deviceMemSrc);
+        test_error(error, "Unable to set the first kernel argument");
 
         // Set parameter 1 to be the destination buffer
-        error = clSetKernelArg(kernel, 1, sizeof( cl_mem ), ( void * )&deviceMemDst );
-        test_error( error, "Unable to set the second kernel argument" );
+        error =
+            clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&deviceMemDst);
+        test_error(error, "Unable to set the second kernel argument");
 
         // Execute kernel
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
-            &cnDimension, 0, 0, NULL, NULL );
-        test_error( error, "Unable to enqueue kernel" );
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &cnDimension, 0,
+                                       0, NULL, NULL);
+        test_error(error, "Unable to enqueue kernel");
 
-        error = clFlush( queue );
-        test_error( error, "Unable to flush the queue" );
+        error = clFlush(queue);
+        test_error(error, "Unable to flush the queue");
 
         // copy results from device back to host
-        error = clEnqueueReadBuffer(queue, deviceMemDst, CL_TRUE, 0, cnDimension * sizeof( cl_float ),
-            dstBuffer, 0, NULL,    NULL );
-        test_error( error, "Unable to read the destination buffer" );
+        error = clEnqueueReadBuffer(queue, deviceMemDst, CL_TRUE, 0,
+                                    cnDimension * sizeof(cl_float), dstBuffer,
+                                    0, NULL, NULL);
+        test_error(error, "Unable to read the destination buffer");
 
-        error = clFlush( queue );
-        test_error( error, "Unable to flush the queue" );
+        error = clFlush(queue);
+        test_error(error, "Unable to flush the queue");
 
         // Compare the source and destination buffers
-        const int* pSrc = (int*)srcBuffer;
-        const int* pDst = (int*)dstBuffer;
+        const int *pSrc = (int *)srcBuffer;
+        const int *pDst = (int *)dstBuffer;
         int mismatch = 0;
 
-        for( size_t i = 0; i < cnDimension; i++ )
+        for (size_t i = 0; i < cnDimension; i++)
         {
-            if( pSrc[i] != pDst[i] )
+            if (pSrc[i] != pDst[i])
             {
-                if( mismatch < 4 )
+                if (mismatch < 4)
                 {
-                    log_info("Offset %08lX:  Expected %08X, Got %08X\n", i * 4, pSrc[i],    pDst[i] );
+                    log_info("Offset %08lX:  Expected %08X, Got %08X\n", i * 4,
+                             pSrc[i], pDst[i]);
                 }
                 else
                 {
@@ -1700,9 +2078,9 @@
             }
         }
 
-        if( mismatch )
+        if (mismatch)
         {
-            log_info("*** %d mismatches found, TEST FAILS! ***\n", mismatch );
+            log_info("*** %d mismatches found, TEST FAILS! ***\n", mismatch);
             result = -1;
         }
         else
@@ -1710,806 +2088,989 @@
             log_info("Buffers match, test passes.\n");
         }
 
-        free( srcBuffer );
+        free(srcBuffer);
         srcBuffer = NULL;
-        free( dstBuffer );
+        free(dstBuffer);
         dstBuffer = NULL;
 
-        if( deviceMemSrc )
+        if (deviceMemSrc)
         {
-            error = clReleaseMemObject( deviceMemSrc );
-            test_error( error, "Unable to release memory object" );
+            error = clReleaseMemObject(deviceMemSrc);
+            test_error(error, "Unable to release memory object");
         }
 
-        if( deviceMemDst )
+        if (deviceMemDst)
         {
-            error = clReleaseMemObject( deviceMemDst );
-            test_error( error, "Unable to release memory object" );
+            error = clReleaseMemObject(deviceMemDst);
+            test_error(error, "Unable to release memory object");
         }
     }
     return result;
 }
 
-int test_execute_after_simple_compile_and_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_simple_compile_and_link(cl_device_id deviceID,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing execution after a simple compile and link...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program = clLinkProgram(
+        context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
+    test_error(error, "Unable to link a simple program");
 
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_simple_compile_and_link_no_device_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_simple_compile_and_link_no_device_info(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
 {
     int error;
     cl_program program;
 
-    log_info("Testing execution after a simple compile and link with no device information provided...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    log_info("Testing execution after a simple compile and link with no device "
+             "information provided...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     error = clCompileProgram(program, 0, NULL, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 0, NULL, NULL, 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 0, NULL, NULL, 1, &program, NULL, NULL, &error);
+    test_error(error, "Unable to link a simple program");
 
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_simple_compile_and_link_with_defines(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_simple_compile_and_link_with_defines(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
 {
     int error;
     cl_program program;
 
-    log_info("Testing execution after a simple compile and link with defines...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel_with_defines, "-DFIRST=5 -DSECOND=37");
-    if( program == NULL || error != CL_SUCCESS )
+    log_info(
+        "Testing execution after a simple compile and link with defines...\n");
+    error = create_single_kernel_helper_create_program(
+        context, &program, 1, &simple_kernel_with_defines,
+        "-DFIRST=5 -DSECOND=37");
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, "-DFIRST=5 -DSECOND=37", 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, "-DFIRST=5 -DSECOND=37", 0,
+                             NULL, NULL, NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program = clLinkProgram(
+        context, 1, &deviceID, NULL, 1, &program, NULL, NULL, &error);
+    test_error(error, "Unable to link a simple program");
 
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_serialize_reload_object(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_serialize_reload_object(cl_device_id deviceID,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements)
 {
     int error;
     cl_program program;
-    size_t            binarySize;
+    size_t binarySize;
     unsigned char *binary;
 
-    log_info("Testing execution after serialization and reloading of the object...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    log_info("Testing execution after serialization and reloading of the "
+             "object...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
     // Get the size of the resulting binary (only one device)
-    error = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
-    test_error( error, "Unable to get binary size" );
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
+                             sizeof(binarySize), &binarySize, NULL);
+    test_error(error, "Unable to get binary size");
 
     // Sanity check
-    if( binarySize == 0 )
+    if (binarySize == 0)
     {
-        log_error( "ERROR: Binary size of program is zero (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Binary size of program is zero (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
     // Create a buffer and get the actual binary
-    binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
-    if (binary == NULL) {
-        log_error( "ERROR: Unable to allocate binary character array with %lu characters! (in %s:%d)\n", binarySize, __FILE__, __LINE__ );
+    binary = (unsigned char *)malloc(sizeof(unsigned char) * binarySize);
+    if (binary == NULL)
+    {
+        log_error("ERROR: Unable to allocate binary character array with %lu "
+                  "characters! (in %s:%d)\n",
+                  binarySize, __FILE__, __LINE__);
         return -1;
     }
 
-    unsigned char *buffers[ 1 ] = { binary };
-    cl_int loadErrors[ 1 ];
+    unsigned char *buffers[1] = { binary };
+    cl_int loadErrors[1];
 
     // Do another sanity check here first
     size_t size;
-    error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, 0, NULL, &size );
-    test_error( error, "Unable to get expected size of binaries array" );
-    if( size != sizeof( buffers ) )
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 0, NULL, &size);
+    test_error(error, "Unable to get expected size of binaries array");
+    if (size != sizeof(buffers))
     {
-        log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d) (in %s:%d)\n", (int)sizeof( buffers ), (int)size, __FILE__, __LINE__ );
+        log_error("ERROR: Expected size of binaries array in clGetProgramInfo "
+                  "is incorrect (should be %d, got %d) (in %s:%d)\n",
+                  (int)sizeof(buffers), (int)size, __FILE__, __LINE__);
         free(binary);
         return -1;
     }
 
-    error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
-    test_error( error, "Unable to get program binary" );
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(buffers),
+                             &buffers, NULL);
+    test_error(error, "Unable to get program binary");
 
     // use clCreateProgramWithBinary
-    cl_program program_with_binary = clCreateProgramWithBinary(context, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &error);
-    test_error( error, "Unable to create program with binary" );
+    cl_program program_with_binary = clCreateProgramWithBinary(
+        context, 1, &deviceID, &binarySize, (const unsigned char **)buffers,
+        loadErrors, &error);
+    test_error(error, "Unable to create program with binary");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1, &program_with_binary, NULL, NULL, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL, 1, &program_with_binary,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to link a simple program");
 
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( program_with_binary );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program_with_binary);
+    test_error(error, "Unable to release program object");
 
     free(binary);
 
     return 0;
 }
 
-int test_execute_after_serialize_reload_library(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_serialize_reload_library(cl_device_id deviceID,
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements)
 {
     int error;
     cl_program program, another_program;
-    size_t            binarySize;
+    size_t binarySize;
     unsigned char *binary;
 
-    log_info("Testing execution after linking a binary with a simple library...\n");
+    log_info(
+        "Testing execution after linking a binary with a simple library...\n");
     // we will test creation of a simple library from one file
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to create a simple library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to create a simple library");
 
 
     // Get the size of the resulting library (only one device)
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
-    test_error( error, "Unable to get binary size" );
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARY_SIZES,
+                             sizeof(binarySize), &binarySize, NULL);
+    test_error(error, "Unable to get binary size");
 
     // Sanity check
-    if( binarySize == 0 )
+    if (binarySize == 0)
     {
-        log_error( "ERROR: Binary size of program is zero (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Binary size of program is zero (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
     // Create a buffer and get the actual binary
-    binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
-    if (binary == NULL) {
-        log_error( "ERROR: Unable to allocate binary character array with %lu characters (in %s:%d)!", binarySize, __FILE__, __LINE__);
+    binary = (unsigned char *)malloc(sizeof(unsigned char) * binarySize);
+    if (binary == NULL)
+    {
+        log_error("ERROR: Unable to allocate binary character array with %lu "
+                  "characters (in %s:%d)!",
+                  binarySize, __FILE__, __LINE__);
         return -1;
     }
-    unsigned char *buffers[ 1 ] = { binary };
-    cl_int loadErrors[ 1 ];
+    unsigned char *buffers[1] = { binary };
+    cl_int loadErrors[1];
 
     // Do another sanity check here first
     size_t size;
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARIES, 0, NULL, &size );
-    test_error( error, "Unable to get expected size of binaries array" );
-    if( size != sizeof( buffers ) )
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARIES, 0,
+                             NULL, &size);
+    test_error(error, "Unable to get expected size of binaries array");
+    if (size != sizeof(buffers))
     {
-        log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d) (in %s:%d)\n", (int)sizeof( buffers ), (int)size, __FILE__, __LINE__ );
+        log_error("ERROR: Expected size of binaries array in clGetProgramInfo "
+                  "is incorrect (should be %d, got %d) (in %s:%d)\n",
+                  (int)sizeof(buffers), (int)size, __FILE__, __LINE__);
         free(binary);
         return -1;
     }
 
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
-    test_error( error, "Unable to get program binary" );
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARIES,
+                             sizeof(buffers), &buffers, NULL);
+    test_error(error, "Unable to get program binary");
 
     // use clCreateProgramWithBinary
-    cl_program library_with_binary = clCreateProgramWithBinary(context, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &error);
-    test_error( error, "Unable to create program with binary" );
+    cl_program library_with_binary = clCreateProgramWithBinary(
+        context, 1, &deviceID, &binarySize, (const unsigned char **)buffers,
+        loadErrors, &error);
+    test_error(error, "Unable to create program with binary");
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program program_and_archive[2] = { another_program, library_with_binary };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from a binary and a library" );
+    cl_program program_and_archive[2] = { another_program,
+                                          library_with_binary };
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from a binary and a library");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel =
+        clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( library_with_binary );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(library_with_binary);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     free(binary);
 
     return 0;
 }
 
-static void CL_CALLBACK program_compile_completion_callback(cl_program program, void* user_data)
+static void CL_CALLBACK program_compile_completion_callback(cl_program program,
+                                                            void *user_data)
 {
     int error;
     cl_event compile_program_completion_event = (cl_event)user_data;
-    log_info("in the program_compile_completion_callback: program %p just completed compiling with '%p'\n", program, compile_program_completion_event);
+    log_info("in the program_compile_completion_callback: program %p just "
+             "completed compiling with '%p'\n",
+             program, compile_program_completion_event);
 
     error = clSetUserEventStatus(compile_program_completion_event, CL_COMPLETE);
     if (error != CL_SUCCESS)
     {
-        log_error( "ERROR: in the program_compile_completion_callback: Unable to set user event status to CL_COMPLETE! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: in the program_compile_completion_callback: Unable "
+                  "to set user event status to CL_COMPLETE! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         exit(-1);
     }
-    log_info("in the program_compile_completion_callback: Successfully signaled compile_program_completion_event event!\n");
+    log_info("in the program_compile_completion_callback: Successfully "
+             "signaled compile_program_completion_event event!\n");
 }
 
-static void CL_CALLBACK program_link_completion_callback(cl_program program, void* user_data)
+static void CL_CALLBACK program_link_completion_callback(cl_program program,
+                                                         void *user_data)
 {
     int error;
     cl_event link_program_completion_event = (cl_event)user_data;
-    log_info("in the program_link_completion_callback: program %p just completed linking with '%p'\n", program, link_program_completion_event);
+    log_info("in the program_link_completion_callback: program %p just "
+             "completed linking with '%p'\n",
+             program, link_program_completion_event);
 
     error = clSetUserEventStatus(link_program_completion_event, CL_COMPLETE);
     if (error != CL_SUCCESS)
     {
-        log_error( "ERROR: in the program_link_completion_callback: Unable to set user event status to CL_COMPLETE! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: in the program_link_completion_callback: Unable to "
+                  "set user event status to CL_COMPLETE! (%s in %s:%d)\n",
+                  IGetErrorString(error), __FILE__, __LINE__);
         exit(-1);
     }
-    log_info("in the program_link_completion_callback: Successfully signaled link_program_completion_event event!\n");
+    log_info("in the program_link_completion_callback: Successfully signaled "
+             "link_program_completion_event event!\n");
 }
 
-int test_execute_after_simple_compile_and_link_with_callbacks(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_simple_compile_and_link_with_callbacks(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
 {
     int error;
     cl_program program;
     cl_event compile_program_completion_event, link_program_completion_event;
 
-    log_info("Testing execution after a simple compile and link with callbacks...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    log_info("Testing execution after a simple compile and link with "
+             "callbacks...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     compile_program_completion_event = clCreateUserEvent(context, &error);
-    test_error( error, "Unable to create a user event");
+    test_error(error, "Unable to create a user event");
 
     error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL,
-        program_compile_completion_callback, (void*)compile_program_completion_event);
-    test_error( error, "Unable to compile a simple program" );
+                             program_compile_completion_callback,
+                             (void *)compile_program_completion_event);
+    test_error(error, "Unable to compile a simple program");
 
     error = clWaitForEvents(1, &compile_program_completion_event);
-    test_error( error, "clWaitForEvents failed when waiting on compile_program_completion_event");
+    test_error(error,
+               "clWaitForEvents failed when waiting on "
+               "compile_program_completion_event");
 
     error = clReleaseEvent(compile_program_completion_event);
-    test_error( error, "Unable to release event object" );
+    test_error(error, "Unable to release event object");
 
     link_program_completion_event = clCreateUserEvent(context, &error);
-    test_error( error, "Unable to create a user event");
+    test_error(error, "Unable to create a user event");
 
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 1, &program,
-        program_link_completion_callback, (void*)link_program_completion_event, &error);
-    test_error( error, "Unable to link a simple program" );
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL, 1, &program,
+                      program_link_completion_callback,
+                      (void *)link_program_completion_event, &error);
+    test_error(error, "Unable to link a simple program");
 
     error = clWaitForEvents(1, &link_program_completion_event);
-    test_error( error, "clWaitForEvents failed when waiting on link_program_completion_event");
+    test_error(
+        error,
+        "clWaitForEvents failed when waiting on link_program_completion_event");
 
     error = clReleaseEvent(link_program_completion_event);
-    test_error( error, "Unable to release event object" );
+    test_error(error, "Unable to release event object");
 
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_library_only(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_library_only(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
 
     log_info("Testing creation of a simple library...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to create a simple library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to create a simple library");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_library_with_callback(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_library_with_callback(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program;
     cl_event link_program_completion_event;
 
     log_info("Testing creation of a simple library with a callback...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
     link_program_completion_event = clCreateUserEvent(context, &error);
-    test_error( error, "Unable to create a user event");
+    test_error(error, "Unable to create a user event");
 
-    simple_user_data simple_link_user_data = {when_i_pondered_weak_and_weary, link_program_completion_event};
+    simple_user_data simple_link_user_data = { when_i_pondered_weak_and_weary,
+                                               link_program_completion_event };
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program,
-        simple_link_callback, (void*)&simple_link_user_data, &error);
-    test_error( error, "Unable to create a simple library" );
+    cl_program my_newly_minted_library = clLinkProgram(
+        context, 1, &deviceID, "-create-library", 1, &program,
+        simple_link_callback, (void *)&simple_link_user_data, &error);
+    test_error(error, "Unable to create a simple library");
 
     error = clWaitForEvents(1, &link_program_completion_event);
-    test_error( error, "clWaitForEvents failed when waiting on link_program_completion_event");
+    test_error(
+        error,
+        "clWaitForEvents failed when waiting on link_program_completion_event");
 
     /* All done! */
     error = clReleaseEvent(link_program_completion_event);
-    test_error( error, "Unable to release event object" );
+    test_error(error, "Unable to release event object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_simple_library_with_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_simple_library_with_link(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program, another_program;
 
     log_info("Testing creation and linking with a simple library...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to create a simple library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to create a simple library");
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program program_and_archive[2] = { another_program, my_newly_minted_library };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from a binary and a library" );
+    cl_program program_and_archive[2] = { another_program,
+                                          my_newly_minted_library };
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from a binary and a library");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_simple_library_with_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_simple_library_with_link(cl_device_id deviceID,
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements)
 {
     int error;
     cl_program program, another_program;
 
-    log_info("Testing execution after linking a binary with a simple library...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    log_info(
+        "Testing execution after linking a binary with a simple library...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program, NULL, NULL, &error);
-    test_error( error, "Unable to create a simple library" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program,
+                      NULL, NULL, &error);
+    test_error(error, "Unable to create a simple library");
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program program_and_archive[2] = { another_program, my_newly_minted_library };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from a binary and a library" );
+    cl_program program_and_archive[2] = { another_program,
+                                          my_newly_minted_library };
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from a binary and a library");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel =
+        clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_two_file_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_two_file_link(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program, another_program;
 
     log_info("Testing two file compiling and linking...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     cl_program two_programs[2] = { program, another_program };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from two binaries" );
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
+    test_error(error, "Unable to create an executable from two binaries");
 
     /* All done! */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_two_file_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_two_file_link(cl_device_id deviceID, cl_context context,
+                                     cl_command_queue queue, int num_elements)
 {
     int error;
     cl_program program, another_program;
 
-    log_info("Testing two file compiling and linking and execution of two kernels afterwards ...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    log_info("Testing two file compiling and linking and execution of two "
+             "kernels afterwards ...\n");
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     cl_program two_programs[2] = { program, another_program };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from two binaries" );
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
+    test_error(error, "Unable to create an executable from two binaries");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel =
+        clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_execute_after_embedded_header_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_embedded_header_link(cl_device_id deviceID,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements)
 {
     int error;
     cl_program program, header, simple_program;
 
     log_info("Testing execution after embedded header link...\n");
     // we will test execution after compiling and linking with embedded headers
-    program = clCreateProgramWithSource(context, 1, &another_simple_kernel_with_header, NULL, &error);
-    if( program == NULL || error != CL_SUCCESS )
+    program = clCreateProgramWithSource(
+        context, 1, &another_simple_kernel_with_header, NULL, &error);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    header = clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
-    if( header == NULL || error != CL_SUCCESS )
+    header =
+        clCreateProgramWithSource(context, 1, &simple_header, NULL, &error);
+    if (header == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple header program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple header program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header, &simple_header_name, NULL, NULL);
-    test_error( error, "Unable to compile a simple program with embedded header" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 1, &header,
+                             &simple_header_name, NULL, NULL);
+    test_error(error,
+               "Unable to compile a simple program with embedded header");
 
-    simple_program = clCreateProgramWithSource(context, 1, &simple_kernel, NULL, &error);
-    if( simple_program == NULL || error != CL_SUCCESS )
+    simple_program =
+        clCreateProgramWithSource(context, 1, &simple_kernel, NULL, &error);
+    if (simple_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     cl_program two_programs[2] = { program, simple_program };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from two binaries, one compiled with embedded header" );
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from two binaries, one compiled "
+               "with embedded header");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel =
+        clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( header );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(header);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( simple_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(simple_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
 #if defined(__APPLE__) || defined(__linux)
-#define _mkdir(x) mkdir(x,S_IRWXU)
+#define _mkdir(x) mkdir(x, S_IRWXU)
 #define _chdir chdir
 #define _rmdir rmdir
 #define _unlink unlink
@@ -2517,461 +3078,602 @@
 #include <direct.h>
 #endif
 
-int test_execute_after_included_header_link(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_execute_after_included_header_link(cl_device_id deviceID,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements)
 {
     int error;
     cl_program program, simple_program;
 
     log_info("Testing execution after included header link...\n");
     // we will test execution after compiling and linking with included headers
-    program = clCreateProgramWithSource(context, 1, &another_simple_kernel_with_header, NULL, &error);
-    if( program == NULL || error != CL_SUCCESS )
+    program = clCreateProgramWithSource(
+        context, 1, &another_simple_kernel_with_header, NULL, &error);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* setup */
-#if (defined(__linux__) || defined(__APPLE__)) && (!defined( __ANDROID__ ))
+#if (defined(__linux__) || defined(__APPLE__)) && (!defined(__ANDROID__))
     /* Some tests systems doesn't allow one to write in the test directory */
-    if (_chdir("/tmp") != 0) {
-        log_error( "ERROR: Unable to remove directory foo/bar! (in %s:%d)\n", __FILE__, __LINE__ );
+    if (_chdir("/tmp") != 0)
+    {
+        log_error("ERROR: Unable to remove directory foo/bar! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 #endif
-    if (_mkdir("foo") != 0) {
-        log_error( "ERROR: Unable to create directory foo! (in %s:%d)\n", __FILE__, __LINE__ );
+    if (_mkdir("foo") != 0)
+    {
+        log_error("ERROR: Unable to create directory foo! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
-    if (_mkdir("foo/bar") != 0) {
-        log_error( "ERROR: Unable to create directory foo/bar! (in %s:%d)\n", __FILE__, __LINE__ );
+    if (_mkdir("foo/bar") != 0)
+    {
+        log_error("ERROR: Unable to create directory foo/bar! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
-    if (_chdir("foo/bar") != 0) {
-        log_error( "ERROR: Unable to change to directory foo/bar! (in %s:%d)\n", __FILE__, __LINE__ );
+    if (_chdir("foo/bar") != 0)
+    {
+        log_error("ERROR: Unable to change to directory foo/bar! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
-    FILE* simple_header_file = fopen(simple_header_name, "w");
-    if (simple_header_file == NULL) {
-        log_error( "ERROR: Unable to create simple header file %s! (in %s:%d)\n", simple_header_name, __FILE__, __LINE__ );
+    FILE *simple_header_file = fopen(simple_header_name, "w");
+    if (simple_header_file == NULL)
+    {
+        log_error("ERROR: Unable to create simple header file %s! (in %s:%d)\n",
+                  simple_header_name, __FILE__, __LINE__);
         return -1;
     }
-    if (fprintf(simple_header_file, "%s", simple_header) < 0) {
-        log_error( "ERROR: Unable to write to simple header file %s! (in %s:%d)\n", simple_header_name, __FILE__, __LINE__);
+    if (fprintf(simple_header_file, "%s", simple_header) < 0)
+    {
+        log_error(
+            "ERROR: Unable to write to simple header file %s! (in %s:%d)\n",
+            simple_header_name, __FILE__, __LINE__);
         return -1;
     }
-    if (fclose(simple_header_file) != 0) {
-        log_error( "ERROR: Unable to close simple header file %s! (in %s:%d)\n", simple_header_name, __FILE__, __LINE__);
+    if (fclose(simple_header_file) != 0)
+    {
+        log_error("ERROR: Unable to close simple header file %s! (in %s:%d)\n",
+                  simple_header_name, __FILE__, __LINE__);
         return -1;
     }
-    if (_chdir("../..") != 0) {
-        log_error( "ERROR: Unable to change to original working directory! (in %s:%d)\n", __FILE__, __LINE__);
+    if (_chdir("../..") != 0)
+    {
+        log_error("ERROR: Unable to change to original working directory! (in "
+                  "%s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
-#if (defined(__linux__) || defined(__APPLE__)) && (!defined( __ANDROID__ ))
-    error = clCompileProgram(program, 1, &deviceID, "-I/tmp/foo/bar", 0, NULL, NULL, NULL, NULL);
+#if (defined(__linux__) || defined(__APPLE__)) && (!defined(__ANDROID__))
+    error = clCompileProgram(program, 1, &deviceID, "-I/tmp/foo/bar", 0, NULL,
+                             NULL, NULL, NULL);
 #else
-    error = clCompileProgram(program, 1, &deviceID, "-Ifoo/bar", 0, NULL, NULL, NULL, NULL);
+    error = clCompileProgram(program, 1, &deviceID, "-Ifoo/bar", 0, NULL, NULL,
+                             NULL, NULL);
 #endif
-    test_error( error, "Unable to compile a simple program with included header" );
+    test_error(error,
+               "Unable to compile a simple program with included header");
 
     /* cleanup */
-    if (_chdir("foo/bar") != 0) {
-        log_error( "ERROR: Unable to change to directory foo/bar! (in %s:%d)\n", __FILE__, __LINE__ );
-        return -1;
-    }
-    if (_unlink(simple_header_name) != 0) {
-        log_error( "ERROR: Unable to remove simple header file %s! (in %s:%d)\n", simple_header_name, __FILE__, __LINE__ );
-        return -1;
-    }
-    if (_chdir("../..") != 0) {
-        log_error( "ERROR: Unable to change to original working directory! (in %s:%d)\n", __FILE__, __LINE__ );
-        return -1;
-    }
-    if (_rmdir("foo/bar") != 0) {
-        log_error( "ERROR: Unable to remove directory foo/bar! (in %s:%d)\n", __FILE__, __LINE__ );
-        return -1;
-    }
-    if (_rmdir("foo") != 0) {
-        log_error( "ERROR: Unable to remove directory foo! (in %s:%d)\n", __FILE__, __LINE__ );
-        return -1;
-    }
-
-    simple_program = clCreateProgramWithSource(context, 1, &simple_kernel, NULL, &error);
-    if( simple_program == NULL || error != CL_SUCCESS )
+    if (_chdir("foo/bar") != 0)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to change to directory foo/bar! (in %s:%d)\n",
+                  __FILE__, __LINE__);
+        return -1;
+    }
+    if (_unlink(simple_header_name) != 0)
+    {
+        log_error("ERROR: Unable to remove simple header file %s! (in %s:%d)\n",
+                  simple_header_name, __FILE__, __LINE__);
+        return -1;
+    }
+    if (_chdir("../..") != 0)
+    {
+        log_error("ERROR: Unable to change to original working directory! (in "
+                  "%s:%d)\n",
+                  __FILE__, __LINE__);
+        return -1;
+    }
+    if (_rmdir("foo/bar") != 0)
+    {
+        log_error("ERROR: Unable to remove directory foo/bar! (in %s:%d)\n",
+                  __FILE__, __LINE__);
+        return -1;
+    }
+    if (_rmdir("foo") != 0)
+    {
+        log_error("ERROR: Unable to remove directory foo! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    simple_program =
+        clCreateProgramWithSource(context, 1, &simple_kernel, NULL, &error);
+    if (simple_program == NULL || error != CL_SUCCESS)
+    {
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
+        return -1;
+    }
+
+    error = clCompileProgram(simple_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
     cl_program two_programs[2] = { program, simple_program };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from two binaries, one compiled with embedded header" );
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, two_programs, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from two binaries, one compiled "
+               "with embedded header");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel =
+        clCreateKernel(fully_linked_program, "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( simple_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(simple_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_program_binary_type(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_program_binary_type(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
 {
     int error;
-    cl_program program, another_program, program_with_binary, fully_linked_program_with_binary;
+    cl_program program, another_program, program_with_binary,
+        fully_linked_program_with_binary;
     cl_program_binary_type program_type = -1;
     size_t size;
-    size_t            binarySize;
+    size_t binarySize;
     unsigned char *binary;
 
     log_info("Testing querying of program binary type...\n");
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, NULL,
+                             NULL);
+    test_error(error, "Unable to compile a simple program");
 
-  error = clGetProgramBuildInfo (program, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-    test_error( error, "Unable to get program binary type" );
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BINARY_TYPE,
+                                  sizeof(cl_program_binary_type), &program_type,
+                                  NULL);
+    test_error(error, "Unable to get program binary type");
     if (program_type != CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT)
     {
-        log_error( "ERROR: Expected program type of a just compiled program to be CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Expected program type of a just compiled program to "
+                  "be CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
     program_type = -1;
 
     // Get the size of the resulting binary (only one device)
-    error = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
-    test_error( error, "Unable to get binary size" );
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
+                             sizeof(binarySize), &binarySize, NULL);
+    test_error(error, "Unable to get binary size");
 
     // Sanity check
-    if( binarySize == 0 )
+    if (binarySize == 0)
     {
-        log_error( "ERROR: Binary size of program is zero (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Binary size of program is zero (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
     // Create a buffer and get the actual binary
     {
-        binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
-        if (binary == NULL) {
-            log_error( "ERROR: Unable to allocate binary character array with %lu characters! (in %s:%d)\n", binarySize, __FILE__, __LINE__ );
+        binary = (unsigned char *)malloc(sizeof(unsigned char) * binarySize);
+        if (binary == NULL)
+        {
+            log_error("ERROR: Unable to allocate binary character array with "
+                      "%lu characters! (in %s:%d)\n",
+                      binarySize, __FILE__, __LINE__);
             return -1;
         }
-        unsigned char *buffers[ 1 ] = { binary };
-        cl_int loadErrors[ 1 ];
+        unsigned char *buffers[1] = { binary };
+        cl_int loadErrors[1];
 
         // Do another sanity check here first
         size_t size;
-        error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, 0, NULL, &size );
-        test_error( error, "Unable to get expected size of binaries array" );
-        if( size != sizeof( buffers ) )
+        error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 0, NULL, &size);
+        test_error(error, "Unable to get expected size of binaries array");
+        if (size != sizeof(buffers))
         {
-            log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d) (in %s:%d)\n", (int)sizeof( buffers ), (int)size, __FILE__, __LINE__ );
+            log_error(
+                "ERROR: Expected size of binaries array in clGetProgramInfo is "
+                "incorrect (should be %d, got %d) (in %s:%d)\n",
+                (int)sizeof(buffers), (int)size, __FILE__, __LINE__);
             free(binary);
             return -1;
         }
 
-        error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
-        test_error( error, "Unable to get program binary" );
+        error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(buffers),
+                                 &buffers, NULL);
+        test_error(error, "Unable to get program binary");
 
         // use clCreateProgramWithBinary
-        program_with_binary = clCreateProgramWithBinary(context, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &error);
-        test_error( error, "Unable to create program with binary" );
+        program_with_binary = clCreateProgramWithBinary(
+            context, 1, &deviceID, &binarySize, (const unsigned char **)buffers,
+            loadErrors, &error);
+        test_error(error, "Unable to create program with binary");
 
-        error = clGetProgramBuildInfo (program_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-        test_error( error, "Unable to get program binary type" );
+        error = clGetProgramBuildInfo(
+            program_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE,
+            sizeof(cl_program_binary_type), &program_type, NULL);
+        test_error(error, "Unable to get program binary type");
         if (program_type != CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT)
         {
-            log_error( "ERROR: Expected program type of a program created from compiled object to be CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT (in %s:%d)\n", __FILE__, __LINE__ );
+            log_error("ERROR: Expected program type of a program created from "
+                      "compiled object to be "
+                      "CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT (in %s:%d)\n",
+                      __FILE__, __LINE__);
             return -1;
         }
         program_type = -1;
         free(binary);
     }
 
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", 1, &program_with_binary, NULL, NULL, &error);
-  test_error( error, "Unable to create a simple library" );
-  error = clGetProgramBuildInfo (my_newly_minted_library, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-    test_error( error, "Unable to get program binary type" );
+    cl_program my_newly_minted_library =
+        clLinkProgram(context, 1, &deviceID, "-create-library", 1,
+                      &program_with_binary, NULL, NULL, &error);
+    test_error(error, "Unable to create a simple library");
+    error = clGetProgramBuildInfo(
+        my_newly_minted_library, deviceID, CL_PROGRAM_BINARY_TYPE,
+        sizeof(cl_program_binary_type), &program_type, NULL);
+    test_error(error, "Unable to get program binary type");
     if (program_type != CL_PROGRAM_BINARY_TYPE_LIBRARY)
     {
-        log_error( "ERROR: Expected program type of a just linked library to be CL_PROGRAM_BINARY_TYPE_LIBRARY (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Expected program type of a just linked library to be "
+                  "CL_PROGRAM_BINARY_TYPE_LIBRARY (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
     program_type = -1;
 
     // Get the size of the resulting library (only one device)
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
-    test_error( error, "Unable to get binary size" );
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARY_SIZES,
+                             sizeof(binarySize), &binarySize, NULL);
+    test_error(error, "Unable to get binary size");
 
     // Sanity check
-    if( binarySize == 0 )
+    if (binarySize == 0)
     {
-        log_error( "ERROR: Binary size of program is zero (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Binary size of program is zero (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
     // Create a buffer and get the actual binary
-    binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
-    if (binary == NULL) {
-        log_error( "ERROR: Unable to allocate binary character array with %lu characters! (in %s:%d)\n", binarySize, __FILE__, __LINE__);
+    binary = (unsigned char *)malloc(sizeof(unsigned char) * binarySize);
+    if (binary == NULL)
+    {
+        log_error("ERROR: Unable to allocate binary character array with %lu "
+                  "characters! (in %s:%d)\n",
+                  binarySize, __FILE__, __LINE__);
         return -1;
     }
 
-    unsigned char *buffers[ 1 ] = { binary };
-    cl_int loadErrors[ 1 ];
+    unsigned char *buffers[1] = { binary };
+    cl_int loadErrors[1];
 
     // Do another sanity check here first
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARIES, 0, NULL, &size );
-    test_error( error, "Unable to get expected size of binaries array" );
-    if( size != sizeof( buffers ) )
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARIES, 0,
+                             NULL, &size);
+    test_error(error, "Unable to get expected size of binaries array");
+    if (size != sizeof(buffers))
     {
-        log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d) (in %s:%d)\n", (int)sizeof( buffers ), (int)size, __FILE__, __LINE__ );
+        log_error("ERROR: Expected size of binaries array in clGetProgramInfo "
+                  "is incorrect (should be %d, got %d) (in %s:%d)\n",
+                  (int)sizeof(buffers), (int)size, __FILE__, __LINE__);
         free(binary);
         return -1;
     }
 
-    error = clGetProgramInfo( my_newly_minted_library, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
-    test_error( error, "Unable to get program binary" );
+    error = clGetProgramInfo(my_newly_minted_library, CL_PROGRAM_BINARIES,
+                             sizeof(buffers), &buffers, NULL);
+    test_error(error, "Unable to get program binary");
 
     // use clCreateProgramWithBinary
-    cl_program library_with_binary = clCreateProgramWithBinary(context, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &error);
-    test_error( error, "Unable to create program with binary" );
-  error = clGetProgramBuildInfo (library_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-    test_error( error, "Unable to get program binary type" );
+    cl_program library_with_binary = clCreateProgramWithBinary(
+        context, 1, &deviceID, &binarySize, (const unsigned char **)buffers,
+        loadErrors, &error);
+    test_error(error, "Unable to create program with binary");
+    error = clGetProgramBuildInfo(
+        library_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE,
+        sizeof(cl_program_binary_type), &program_type, NULL);
+    test_error(error, "Unable to get program binary type");
     if (program_type != CL_PROGRAM_BINARY_TYPE_LIBRARY)
     {
-        log_error( "ERROR: Expected program type of a library loaded with binary to be CL_PROGRAM_BINARY_TYPE_LIBRARY (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Expected program type of a library loaded with "
+                  "binary to be CL_PROGRAM_BINARY_TYPE_LIBRARY (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
     program_type = -1;
-  free(binary);
+    free(binary);
 
-    error = create_single_kernel_helper_create_program(context, &another_program, 1, &another_simple_kernel);
-    if( another_program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(
+        context, &another_program, 1, &another_simple_kernel);
+    if (another_program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
-    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-    test_error( error, "Unable to compile a simple program" );
+    error = clCompileProgram(another_program, 1, &deviceID, NULL, 0, NULL, NULL,
+                             NULL, NULL);
+    test_error(error, "Unable to compile a simple program");
 
-    cl_program program_and_archive[2] = { another_program, library_with_binary };
-    cl_program fully_linked_program = clLinkProgram(context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
-    test_error( error, "Unable to create an executable from a binary and a library" );
+    cl_program program_and_archive[2] = { another_program,
+                                          library_with_binary };
+    cl_program fully_linked_program = clLinkProgram(
+        context, 1, &deviceID, "", 2, program_and_archive, NULL, NULL, &error);
+    test_error(error,
+               "Unable to create an executable from a binary and a library");
 
-  error = clGetProgramBuildInfo (fully_linked_program, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-    test_error( error, "Unable to get program binary type" );
+    error = clGetProgramBuildInfo(
+        fully_linked_program, deviceID, CL_PROGRAM_BINARY_TYPE,
+        sizeof(cl_program_binary_type), &program_type, NULL);
+    test_error(error, "Unable to get program binary type");
     if (program_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE)
     {
-        log_error( "ERROR: Expected program type of a newly build executable to be CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Expected program type of a newly build executable to "
+                  "be CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
     program_type = -1;
 
     // Get the size of the resulting binary (only one device)
-    error = clGetProgramInfo( fully_linked_program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
-    test_error( error, "Unable to get binary size" );
+    error = clGetProgramInfo(fully_linked_program, CL_PROGRAM_BINARY_SIZES,
+                             sizeof(binarySize), &binarySize, NULL);
+    test_error(error, "Unable to get binary size");
 
     // Sanity check
-    if( binarySize == 0 )
+    if (binarySize == 0)
     {
-        log_error( "ERROR: Binary size of program is zero (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: Binary size of program is zero (in %s:%d)\n",
+                  __FILE__, __LINE__);
         return -1;
     }
 
     // Create a buffer and get the actual binary
     {
-        binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
-        if (binary == NULL) {
-            log_error( "ERROR: Unable to allocate binary character array with %lu characters! (in %s:%d)\n", binarySize, __FILE__, __LINE__ );
+        binary = (unsigned char *)malloc(sizeof(unsigned char) * binarySize);
+        if (binary == NULL)
+        {
+            log_error("ERROR: Unable to allocate binary character array with "
+                      "%lu characters! (in %s:%d)\n",
+                      binarySize, __FILE__, __LINE__);
             return -1;
         }
-        unsigned char *buffers[ 1 ] = { binary };
-        cl_int loadErrors[ 1 ];
+        unsigned char *buffers[1] = { binary };
+        cl_int loadErrors[1];
 
         // Do another sanity check here first
         size_t size;
-        error = clGetProgramInfo( fully_linked_program, CL_PROGRAM_BINARIES, 0, NULL, &size );
-        test_error( error, "Unable to get expected size of binaries array" );
-        if( size != sizeof( buffers ) )
+        error = clGetProgramInfo(fully_linked_program, CL_PROGRAM_BINARIES, 0,
+                                 NULL, &size);
+        test_error(error, "Unable to get expected size of binaries array");
+        if (size != sizeof(buffers))
         {
-            log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d) (in %s:%d)\n", (int)sizeof( buffers ), (int)size, __FILE__, __LINE__ );
+            log_error(
+                "ERROR: Expected size of binaries array in clGetProgramInfo is "
+                "incorrect (should be %d, got %d) (in %s:%d)\n",
+                (int)sizeof(buffers), (int)size, __FILE__, __LINE__);
             free(binary);
             return -1;
         }
 
-        error = clGetProgramInfo( fully_linked_program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
-        test_error( error, "Unable to get program binary" );
+        error = clGetProgramInfo(fully_linked_program, CL_PROGRAM_BINARIES,
+                                 sizeof(buffers), &buffers, NULL);
+        test_error(error, "Unable to get program binary");
 
         // use clCreateProgramWithBinary
-        fully_linked_program_with_binary = clCreateProgramWithBinary(context, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &error);
-        test_error( error, "Unable to create program with binary" );
+        fully_linked_program_with_binary = clCreateProgramWithBinary(
+            context, 1, &deviceID, &binarySize, (const unsigned char **)buffers,
+            loadErrors, &error);
+        test_error(error, "Unable to create program with binary");
 
-    error = clGetProgramBuildInfo (fully_linked_program_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-        test_error( error, "Unable to get program binary type" );
+        error = clGetProgramBuildInfo(
+            fully_linked_program_with_binary, deviceID, CL_PROGRAM_BINARY_TYPE,
+            sizeof(cl_program_binary_type), &program_type, NULL);
+        test_error(error, "Unable to get program binary type");
         if (program_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE)
         {
-            log_error( "ERROR: Expected program type of a program created from a fully linked executable binary to be CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n", __FILE__, __LINE__ );
+            log_error("ERROR: Expected program type of a program created from "
+                      "a fully linked executable binary to be "
+                      "CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n",
+                      __FILE__, __LINE__);
             return -1;
         }
         program_type = -1;
         free(binary);
     }
 
-    error = clBuildProgram(fully_linked_program_with_binary, 1, &deviceID, NULL, NULL, NULL);
-        test_error( error, "Unable to build a simple program" );
+    error = clBuildProgram(fully_linked_program_with_binary, 1, &deviceID, NULL,
+                           NULL, NULL);
+    test_error(error, "Unable to build a simple program");
 
-    cl_kernel kernel = clCreateKernel(fully_linked_program_with_binary, "CopyBuffer", &error);
-    test_error( error, "Unable to create a simple kernel" );
+    cl_kernel kernel =
+        clCreateKernel(fully_linked_program_with_binary, "CopyBuffer", &error);
+    test_error(error, "Unable to create a simple kernel");
 
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
-    cl_kernel another_kernel = clCreateKernel(fully_linked_program_with_binary, "AnotherCopyBuffer", &error);
-    test_error( error, "Unable to create another simple kernel" );
+    cl_kernel another_kernel = clCreateKernel(fully_linked_program_with_binary,
+                                              "AnotherCopyBuffer", &error);
+    test_error(error, "Unable to create another simple kernel");
 
     error = verifyCopyBuffer(context, queue, another_kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseKernel( another_kernel );
-    test_error( error, "Unable to release another kernel object" );
+    error = clReleaseKernel(another_kernel);
+    test_error(error, "Unable to release another kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    /* Oh, one more thing. Steve Jobs and apparently Herb Sutter. The question is "Who is copying whom?" */
-    error = create_single_kernel_helper_create_program(context, &program, 1, &simple_kernel);
-    if( program == NULL || error != CL_SUCCESS )
+    /* Oh, one more thing. Steve Jobs and apparently Herb Sutter. The question
+     * is "Who is copying whom?" */
+    error = create_single_kernel_helper_create_program(context, &program, 1,
+                                                       &simple_kernel);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create a simple test program! (%s in %s:%d)\n", IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Unable to create a simple test program! (%s in %s:%d)\n",
+            IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     error = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
-    test_error( error, "Unable to build a simple program" );
-  error = clGetProgramBuildInfo (program, deviceID, CL_PROGRAM_BINARY_TYPE, sizeof(cl_program_binary_type), &program_type, NULL);
-    test_error( error, "Unable to get program binary type" );
+    test_error(error, "Unable to build a simple program");
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BINARY_TYPE,
+                                  sizeof(cl_program_binary_type), &program_type,
+                                  NULL);
+    test_error(error, "Unable to get program binary type");
     if (program_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE)
     {
-        log_error( "ERROR: Expected program type of a program created from compiled object to be CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error(
+            "ERROR: Expected program type of a program created from compiled "
+            "object to be CL_PROGRAM_BINARY_TYPE_EXECUTABLE (in %s:%d)\n",
+            __FILE__, __LINE__);
         return -1;
     }
     program_type = -1;
 
     /* All's well that ends well. William Shakespeare */
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( another_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(another_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( library_with_binary );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(library_with_binary);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( fully_linked_program_with_binary );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(fully_linked_program_with_binary);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( program_with_binary );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program_with_binary);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-volatile int       compileNotificationSent;
+volatile int compileNotificationSent;
 
-void CL_CALLBACK test_notify_compile_complete( cl_program program, void *userData )
+void CL_CALLBACK test_notify_compile_complete(cl_program program,
+                                              void *userData)
 {
-    if( userData == NULL || strcmp( (char *)userData, "compilation" ) != 0 )
+    if (userData == NULL || strcmp((char *)userData, "compilation") != 0)
     {
-        log_error( "ERROR: User data passed in to compile notify function was not correct! (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: User data passed in to compile notify function was "
+                  "not correct! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         compileNotificationSent = -1;
     }
     else
         compileNotificationSent = 1;
-    log_info( "\n   <-- program successfully compiled\n" );
+    log_info("\n   <-- program successfully compiled\n");
 }
 
-volatile int       libraryCreationNotificationSent;
+volatile int libraryCreationNotificationSent;
 
-void CL_CALLBACK test_notify_create_library_complete( cl_program program, void *userData )
+void CL_CALLBACK test_notify_create_library_complete(cl_program program,
+                                                     void *userData)
 {
-    if( userData == NULL || strcmp( (char *)userData, "create library" ) != 0 )
+    if (userData == NULL || strcmp((char *)userData, "create library") != 0)
     {
-        log_error( "ERROR: User data passed in to library creation notify function was not correct! (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: User data passed in to library creation notify "
+                  "function was not correct! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         libraryCreationNotificationSent = -1;
     }
     else
         libraryCreationNotificationSent = 1;
-    log_info( "\n   <-- library successfully created\n" );
+    log_info("\n   <-- library successfully created\n");
 }
 
-volatile int       linkNotificationSent;
+volatile int linkNotificationSent;
 
-void CL_CALLBACK test_notify_link_complete( cl_program program, void *userData )
+void CL_CALLBACK test_notify_link_complete(cl_program program, void *userData)
 {
-    if( userData == NULL || strcmp( (char *)userData, "linking" ) != 0 )
+    if (userData == NULL || strcmp((char *)userData, "linking") != 0)
     {
-        log_error( "ERROR: User data passed in to link notify function was not correct! (in %s:%d)\n", __FILE__, __LINE__ );
+        log_error("ERROR: User data passed in to link notify function was not "
+                  "correct! (in %s:%d)\n",
+                  __FILE__, __LINE__);
         linkNotificationSent = -1;
     }
     else
         linkNotificationSent = 1;
-    log_info( "\n   <-- program successfully linked\n" );
+    log_info("\n   <-- program successfully linked\n");
 }
 
-int test_large_compile_and_link_status_options_log(cl_context context, cl_device_id deviceID, cl_command_queue queue, unsigned int numLines)
+int test_large_compile_and_link_status_options_log(cl_context context,
+                                                   cl_device_id deviceID,
+                                                   cl_command_queue queue,
+                                                   unsigned int numLines)
 {
     int error;
     cl_program program;
-    cl_program * simple_kernels;
+    cl_program *simple_kernels;
     const char **lines;
     unsigned int i;
     char buffer[MAX_LINE_SIZE_IN_PROGRAM];
@@ -2984,263 +3686,349 @@
     cl_build_status status;
     size_t size_ret;
 
-    compileNotificationSent = libraryCreationNotificationSent = linkNotificationSent = 0;
+    compileNotificationSent = libraryCreationNotificationSent =
+        linkNotificationSent = 0;
 
-    simple_kernels = (cl_program*)malloc(numLines*sizeof(cl_program));
-    if (simple_kernels == NULL) {
-        log_error( "ERROR: Unable to allocate kernels array with %d kernels! (in %s:%d)\n", numLines, __FILE__, __LINE__);
+    simple_kernels = (cl_program *)malloc(numLines * sizeof(cl_program));
+    if (simple_kernels == NULL)
+    {
+        log_error("ERROR: Unable to allocate kernels array with %d kernels! "
+                  "(in %s:%d)\n",
+                  numLines, __FILE__, __LINE__);
         return -1;
     }
     /* First, allocate the array for our line pointers */
-    lines = (const char **)malloc( (2*numLines + 2) * sizeof( const char * ) );
-    if (lines == NULL) {
-        log_error( "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n", (2*numLines + 2), __FILE__, __LINE__);
+    lines = (const char **)malloc((2 * numLines + 2) * sizeof(const char *));
+    if (lines == NULL)
+    {
+        log_error(
+            "ERROR: Unable to allocate lines array with %d lines! (in %s:%d)\n",
+            (2 * numLines + 2), __FILE__, __LINE__);
         return -1;
     }
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, composite_kernel_extern_template, i);
         lines[i] = _strdup(buffer);
     }
     /* First and last lines are easy */
-    lines[ numLines ] = composite_kernel_start;
-    lines[ 2*numLines + 1] = composite_kernel_end;
+    lines[numLines] = composite_kernel_start;
+    lines[2 * numLines + 1] = composite_kernel_end;
 
     /* Fill the rest with templated kernels */
-    for(i = numLines + 1; i < 2*numLines + 1; i++ )
+    for (i = numLines + 1; i < 2 * numLines + 1; i++)
     {
         sprintf(buffer, composite_kernel_template, i - numLines - 1);
-        lines[ i ] = _strdup(buffer);
+        lines[i] = _strdup(buffer);
     }
 
     /* Try to create a program with these lines */
-    error = create_single_kernel_helper_create_program(context, &program, 2 * numLines + 2, lines);
-    if( program == NULL || error != CL_SUCCESS )
+    error = create_single_kernel_helper_create_program(context, &program,
+                                                       2 * numLines + 2, lines);
+    if (program == NULL || error != CL_SUCCESS)
     {
-        log_error( "ERROR: Unable to create long test program with %d lines! (%s) (in %s:%d)\n", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+        log_error("ERROR: Unable to create long test program with %d lines! "
+                  "(%s) (in %s:%d)\n",
+                  numLines, IGetErrorString(error), __FILE__, __LINE__);
         return -1;
     }
 
     /* Lets check that the compilation status is CL_BUILD_NONE */
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Unable to get program compile status" );
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_STATUS,
+                                  sizeof(status), &status, NULL);
+    test_error(error, "Unable to get program compile status");
     if (status != CL_BUILD_NONE)
     {
-        log_error( "ERROR: Expected compile status to be CL_BUILD_NONE prior to the beginning of the compilation! (status: %d in %s:%d)\n", (int)status, __FILE__, __LINE__ );
+        log_error("ERROR: Expected compile status to be CL_BUILD_NONE prior to "
+                  "the beginning of the compilation! (status: %d in %s:%d)\n",
+                  (int)status, __FILE__, __LINE__);
         return -1;
     }
 
     /* Compile it */
-    error = clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL, test_notify_compile_complete, (void *)"compilation");
-    test_error( error, "Unable to compile a simple program" );
+    error =
+        clCompileProgram(program, 1, &deviceID, NULL, 0, NULL, NULL,
+                         test_notify_compile_complete, (void *)"compilation");
+    test_error(error, "Unable to compile a simple program");
 
-    /* Wait for compile to complete (just keep polling, since we're just a test */
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Unable to get program compile status" );
+    /* Wait for compile to complete (just keep polling, since we're just a test
+     */
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_STATUS,
+                                  sizeof(status), &status, NULL);
+    test_error(error, "Unable to get program compile status");
 
-    while( (int)status == CL_BUILD_IN_PROGRESS )
+    while ((int)status == CL_BUILD_IN_PROGRESS)
     {
-        log_info( "\n  -- still waiting for compile... (status is %d)", status );
-        sleep( 1 );
-        error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-        test_error( error, "Unable to get program compile status" );
+        log_info("\n  -- still waiting for compile... (status is %d)", status);
+        sleep(1);
+        error =
+            clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_STATUS,
+                                  sizeof(status), &status, NULL);
+        test_error(error, "Unable to get program compile status");
     }
-    if( status != CL_BUILD_SUCCESS )
+    if (status != CL_BUILD_SUCCESS)
     {
-        log_error( "ERROR: compile failed! (status: %d in %s:%d)\n", (int)status, __FILE__, __LINE__ );
+        log_error("ERROR: compile failed! (status: %d in %s:%d)\n", (int)status,
+                  __FILE__, __LINE__);
         return -1;
     }
 
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret );
-    test_error( error, "Device failed to return compile log size" );
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0,
+                                  NULL, &size_ret);
+    test_error(error, "Device failed to return compile log size");
     compile_log = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_LOG, size_ret, compile_log, NULL );
-    if (error != CL_SUCCESS){
-        log_error("Device failed to return a compile log (in %s:%d)\n", __FILE__, __LINE__);
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG,
+                                  size_ret, compile_log, NULL);
+    if (error != CL_SUCCESS)
+    {
+        log_error("Device failed to return a compile log (in %s:%d)\n",
+                  __FILE__, __LINE__);
         test_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed");
     }
     log_info("BUILD LOG: %s\n", compile_log);
     free(compile_log);
 
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &size_ret );
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_OPTIONS,
+                                  0, NULL, &size_ret);
     test_error(error, "Device failed to return compile options size");
     compile_options = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, size_ret, compile_options, NULL );
-    test_error(error, "Device failed to return compile options.\nclGetProgramBuildInfo CL_PROGRAM_BUILD_OPTIONS failed");
+    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_OPTIONS,
+                                  size_ret, compile_options, NULL);
+    test_error(
+        error,
+        "Device failed to return compile options.\nclGetProgramBuildInfo "
+        "CL_PROGRAM_BUILD_OPTIONS failed");
 
     log_info("BUILD OPTIONS: %s\n", compile_options);
     free(compile_options);
 
     /* Create and compile templated kernels */
-    for( i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
         sprintf(buffer, simple_kernel_template, i);
-        const char* kernel_source = _strdup(buffer);
-        error = create_single_kernel_helper_create_program(context, &simple_kernels[i], 1, &kernel_source);
-        if( simple_kernels[i] == NULL || error != CL_SUCCESS )
+        const char *kernel_source = _strdup(buffer);
+        error = create_single_kernel_helper_create_program(
+            context, &simple_kernels[i], 1, &kernel_source);
+        if (simple_kernels[i] == NULL || error != CL_SUCCESS)
         {
-            log_error( "ERROR: Unable to create long test program with %d lines! (%s in %s:%d)", numLines, IGetErrorString( error ), __FILE__, __LINE__ );
+            log_error("ERROR: Unable to create long test program with %d "
+                      "lines! (%s in %s:%d)",
+                      numLines, IGetErrorString(error), __FILE__, __LINE__);
             return -1;
         }
 
         /* Compile it */
-        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL, NULL, NULL, NULL);
-        test_error( error, "Unable to compile a simple program" );
+        error = clCompileProgram(simple_kernels[i], 1, &deviceID, NULL, 0, NULL,
+                                 NULL, NULL, NULL);
+        test_error(error, "Unable to compile a simple program");
 
-        free((void*)kernel_source);
+        free((void *)kernel_source);
     }
 
     /* Create library out of compiled templated kernels */
-    cl_program my_newly_minted_library = clLinkProgram(context, 1, &deviceID, "-create-library", numLines, simple_kernels, test_notify_create_library_complete, (void *)"create library", &error);
-    test_error( error, "Unable to create a multi-line library" );
+    cl_program my_newly_minted_library = clLinkProgram(
+        context, 1, &deviceID, "-create-library", numLines, simple_kernels,
+        test_notify_create_library_complete, (void *)"create library", &error);
+    test_error(error, "Unable to create a multi-line library");
 
-    /* Wait for library creation to complete (just keep polling, since we're just a test */
-    error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Unable to get library creation link status" );
+    /* Wait for library creation to complete (just keep polling, since we're
+     * just a test */
+    error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                  CL_PROGRAM_BUILD_STATUS, sizeof(status),
+                                  &status, NULL);
+    test_error(error, "Unable to get library creation link status");
 
-    while( (int)status == CL_BUILD_IN_PROGRESS )
+    while ((int)status == CL_BUILD_IN_PROGRESS)
     {
-        log_info( "\n  -- still waiting for library creation... (status is %d)", status );
-        sleep( 1 );
-        error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-        test_error( error, "Unable to get library creation link status" );
+        log_info("\n  -- still waiting for library creation... (status is %d)",
+                 status);
+        sleep(1);
+        error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                      CL_PROGRAM_BUILD_STATUS, sizeof(status),
+                                      &status, NULL);
+        test_error(error, "Unable to get library creation link status");
     }
-    if( status != CL_BUILD_SUCCESS )
+    if (status != CL_BUILD_SUCCESS)
     {
-        log_error( "ERROR: library creation failed! (status: %d in %s:%d)\n", (int)status, __FILE__, __LINE__ );
+        log_error("ERROR: library creation failed! (status: %d in %s:%d)\n",
+                  (int)status, __FILE__, __LINE__);
         return -1;
     }
-    error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret );
-    test_error( error, "Device failed to return a library creation log size" );
+    error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                  CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret);
+    test_error(error, "Device failed to return a library creation log size");
     library_log = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_LOG, size_ret, library_log, NULL );
-    if (error != CL_SUCCESS) {
-        log_error("Device failed to return a library creation log (in %s:%d)\n", __FILE__, __LINE__);
+    error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                  CL_PROGRAM_BUILD_LOG, size_ret, library_log,
+                                  NULL);
+    if (error != CL_SUCCESS)
+    {
+        log_error("Device failed to return a library creation log (in %s:%d)\n",
+                  __FILE__, __LINE__);
         test_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed");
     }
     log_info("CREATE LIBRARY LOG: %s\n", library_log);
     free(library_log);
 
-    error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &size_ret );
+    error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                  CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &size_ret);
     test_error(error, "Device failed to return library creation options size");
     library_options = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( my_newly_minted_library, deviceID, CL_PROGRAM_BUILD_OPTIONS, size_ret, library_options, NULL );
-    test_error(error, "Device failed to return library creation options.\nclGetProgramBuildInfo CL_PROGRAM_BUILD_OPTIONS failed");
+    error = clGetProgramBuildInfo(my_newly_minted_library, deviceID,
+                                  CL_PROGRAM_BUILD_OPTIONS, size_ret,
+                                  library_options, NULL);
+    test_error(
+        error,
+        "Device failed to return library creation "
+        "options.\nclGetProgramBuildInfo CL_PROGRAM_BUILD_OPTIONS failed");
 
     log_info("CREATE LIBRARY OPTIONS: %s\n", library_options);
     free(library_options);
 
-    /* Link the program that calls the kernels and the library that contains them */
+    /* Link the program that calls the kernels and the library that contains
+     * them */
     cl_program programs[2] = { program, my_newly_minted_library };
-    cl_program my_newly_linked_program = clLinkProgram(context, 1, &deviceID, NULL, 2, programs, test_notify_link_complete, (void *)"linking", &error);
-    test_error( error, "Unable to link a program with a library" );
+    cl_program my_newly_linked_program =
+        clLinkProgram(context, 1, &deviceID, NULL, 2, programs,
+                      test_notify_link_complete, (void *)"linking", &error);
+    test_error(error, "Unable to link a program with a library");
 
-    /* Wait for linking to complete (just keep polling, since we're just a test */
-    error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-    test_error( error, "Unable to get program link status" );
+    /* Wait for linking to complete (just keep polling, since we're just a test
+     */
+    error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                  CL_PROGRAM_BUILD_STATUS, sizeof(status),
+                                  &status, NULL);
+    test_error(error, "Unable to get program link status");
 
-    while( (int)status == CL_BUILD_IN_PROGRESS )
+    while ((int)status == CL_BUILD_IN_PROGRESS)
     {
-        log_info( "\n  -- still waiting for program linking... (status is %d)", status );
-        sleep( 1 );
-        error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof( status ), &status, NULL );
-        test_error( error, "Unable to get program link status" );
+        log_info("\n  -- still waiting for program linking... (status is %d)",
+                 status);
+        sleep(1);
+        error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                      CL_PROGRAM_BUILD_STATUS, sizeof(status),
+                                      &status, NULL);
+        test_error(error, "Unable to get program link status");
     }
-    if( status != CL_BUILD_SUCCESS )
+    if (status != CL_BUILD_SUCCESS)
     {
-        log_error( "ERROR: program linking failed! (status: %d in %s:%d)\n", (int)status, __FILE__, __LINE__ );
+        log_error("ERROR: program linking failed! (status: %d in %s:%d)\n",
+                  (int)status, __FILE__, __LINE__);
         return -1;
     }
-    error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret );
-    test_error( error, "Device failed to return a linking log size" );
+    error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                  CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret);
+    test_error(error, "Device failed to return a linking log size");
     linking_log = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_LOG, size_ret, linking_log, NULL );
-    if (error != CL_SUCCESS){
-        log_error("Device failed to return a linking log (in %s:%d).\n", __FILE__, __LINE__);
+    error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                  CL_PROGRAM_BUILD_LOG, size_ret, linking_log,
+                                  NULL);
+    if (error != CL_SUCCESS)
+    {
+        log_error("Device failed to return a linking log (in %s:%d).\n",
+                  __FILE__, __LINE__);
         test_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed");
     }
     log_info("BUILDING LOG: %s\n", linking_log);
     free(linking_log);
 
-    error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &size_ret );
+    error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                  CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &size_ret);
     test_error(error, "Device failed to return linking options size");
     linking_options = (char *)malloc(size_ret);
-    error = clGetProgramBuildInfo( my_newly_linked_program, deviceID, CL_PROGRAM_BUILD_OPTIONS, size_ret, linking_options, NULL );
-    test_error(error, "Device failed to return linking options.\nclGetProgramBuildInfo CL_PROGRAM_BUILD_OPTIONS failed");
+    error = clGetProgramBuildInfo(my_newly_linked_program, deviceID,
+                                  CL_PROGRAM_BUILD_OPTIONS, size_ret,
+                                  linking_options, NULL);
+    test_error(
+        error,
+        "Device failed to return linking options.\nclGetProgramBuildInfo "
+        "CL_PROGRAM_BUILD_OPTIONS failed");
 
     log_info("BUILDING OPTIONS: %s\n", linking_options);
     free(linking_options);
 
     // Create the composite kernel
-    cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
-    test_error( error, "Unable to create a composite kernel" );
+    cl_kernel kernel =
+        clCreateKernel(my_newly_linked_program, "CompositeKernel", &error);
+    test_error(error, "Unable to create a composite kernel");
 
     // Run the composite kernel and verify the results
     error = verifyCopyBuffer(context, queue, kernel);
-    if (error != CL_SUCCESS)
-        return error;
+    if (error != CL_SUCCESS) return error;
 
     /* All done! */
-    error = clReleaseKernel( kernel );
-    test_error( error, "Unable to release kernel object" );
+    error = clReleaseKernel(kernel);
+    test_error(error, "Unable to release kernel object");
 
-    error = clReleaseProgram( program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(program);
+    test_error(error, "Unable to release program object");
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        free( (void*)lines[i] );
-        free( (void*)lines[i+numLines+1] );
+        free((void *)lines[i]);
+        free((void *)lines[i + numLines + 1]);
     }
-    free( lines );
+    free(lines);
 
-    for(i = 0; i < numLines; i++)
+    for (i = 0; i < numLines; i++)
     {
-        error = clReleaseProgram( simple_kernels[i] );
-        test_error( error, "Unable to release program object" );
+        error = clReleaseProgram(simple_kernels[i]);
+        test_error(error, "Unable to release program object");
     }
-    free( simple_kernels );
+    free(simple_kernels);
 
-    error = clReleaseProgram( my_newly_minted_library );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_minted_library);
+    test_error(error, "Unable to release program object");
 
-    error = clReleaseProgram( my_newly_linked_program );
-    test_error( error, "Unable to release program object" );
+    error = clReleaseProgram(my_newly_linked_program);
+    test_error(error, "Unable to release program object");
 
     return 0;
 }
 
-int test_compile_and_link_status_options_log(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_compile_and_link_status_options_log(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
 {
-    unsigned int toTest[] = { 256, 0 }; //512, 1024, 8192, 16384, 32768, 0 };
+    unsigned int toTest[] = { 256, 0 }; // 512, 1024, 8192, 16384, 32768, 0 };
     unsigned int i;
 
-    log_info( "Testing Compile and Link Status, Options and Logging ...this might take awhile...\n" );
+    log_info("Testing Compile and Link Status, Options and Logging ...this "
+             "might take awhile...\n");
 
-    for( i = 0; toTest[ i ] != 0; i++ )
+    for (i = 0; toTest[i] != 0; i++)
     {
-        log_info( "   %d...\n", toTest[ i ] );
+        log_info("   %d...\n", toTest[i]);
 
 #if defined(_WIN32)
         clock_t start = clock();
-#elif  defined(__linux__) || defined(__APPLE__)
-    timeval time1, time2;
-    gettimeofday(&time1, NULL);
+#elif defined(__linux__) || defined(__APPLE__)
+        timeval time1, time2;
+        gettimeofday(&time1, NULL);
 #endif
 
-        if( test_large_compile_and_link_status_options_log( context, deviceID, queue, toTest[ i ] ) != 0 )
+        if (test_large_compile_and_link_status_options_log(context, deviceID,
+                                                           queue, toTest[i])
+            != 0)
         {
-            log_error( "ERROR: large program compilation, linking, status, options and logging test failed for %d lines! (in %s:%d)\n", toTest[ i ], __FILE__, __LINE__ );
+            log_error(
+                "ERROR: large program compilation, linking, status, options "
+                "and logging test failed for %d lines! (in %s:%d)\n",
+                toTest[i], __FILE__, __LINE__);
             return -1;
         }
 
 #if defined(_WIN32)
         clock_t end = clock();
-    log_perf( (float)( end - start ) / (float)CLOCKS_PER_SEC, false, "clock() time in secs", "%d lines", toTest[i] );
-#elif  defined(__linux__) || defined(__APPLE__)
-    gettimeofday(&time2, NULL);
-    log_perf( (float)(float)(time2.tv_sec  - time1.tv_sec) + 1.0e-6 * (time2.tv_usec - time1.tv_usec) , false, "wall time in secs", "%d lines", toTest[i] );
+        log_perf((float)(end - start) / (float)CLOCKS_PER_SEC, false,
+                 "clock() time in secs", "%d lines", toTest[i]);
+#elif defined(__linux__) || defined(__APPLE__)
+        gettimeofday(&time2, NULL);
+        log_perf((float)(float)(time2.tv_sec - time1.tv_sec)
+                     + 1.0e-6 * (time2.tv_usec - time1.tv_usec),
+                 false, "wall time in secs", "%d lines", toTest[i]);
 #endif
     }
 
diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index d53af8d..89626b7 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -76,6 +76,7 @@
     "cl_khr_device_uuid",
     "cl_khr_pci_bus_info",
     "cl_khr_suggested_local_work_size",
+    "cl_khr_expect_assume",
     "cl_khr_spirv_linkonce_odr",
     "cl_khr_semaphore",
     "cl_khr_external_semaphore",
diff --git a/test_conformance/computeinfo/main.cpp b/test_conformance/computeinfo/main.cpp
index 9cecabe..b1d73af 100644
--- a/test_conformance/computeinfo/main.cpp
+++ b/test_conformance/computeinfo/main.cpp
@@ -439,8 +439,8 @@
                 err = clGetPlatformInfo(platform, info->opcode, config_size_set,
                                         &info->config.cl_name_version_single,
                                         &config_size_ret);
+                size_err = config_size_set != config_size_ret;
             }
-            size_err = config_size_set != config_size_ret;
             break;
         default:
             log_error("Unknown config type: %d\n", info->config_type);
@@ -585,8 +585,8 @@
                 err = clGetDeviceInfo(device, info->opcode, config_size_set,
                                       &info->config.cl_name_version_single,
                                       &config_size_ret);
+                size_err = config_size_set != config_size_ret;
             }
-            size_err = config_size_set != config_size_ret;
             break;
         default:
             log_error("Unknown config type: %d\n", info->config_type);
diff --git a/test_conformance/conversions/CMakeLists.txt b/test_conformance/conversions/CMakeLists.txt
index cc019b2..8ed3ba1 100644
--- a/test_conformance/conversions/CMakeLists.txt
+++ b/test_conformance/conversions/CMakeLists.txt
@@ -16,6 +16,6 @@
         COMPILE_FLAGS -march=i686)
 endif(NOT CMAKE_CL_64 AND NOT MSVC AND NOT ANDROID)
 
-set_gnulike_module_compile_flags("-Wno-unused-but-set-variable")
+set_gnulike_module_compile_flags("-Wno-unused-but-set-variable -Wno-sign-compare")
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index dfb3227..1020638 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -15,2243 +15,1496 @@
 //
 #include "harness/testHarness.h"
 #include "harness/compat.h"
+#include "harness/ThreadPool.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/sysctl.h>
+#endif
+#if defined(__linux__)
+#include <sys/param.h>
+#include <libgen.h>
+#endif
+
+#if defined(__MINGW32__)
+#include <sys/param.h>
+#endif
+
+#include <sstream>
+#include <stdarg.h>
+#if !defined(_WIN32)
+#include <libgen.h>
+#include <sys/mman.h>
+#endif
+#include <time.h>
+
+#include <algorithm>
+
+#include <vector>
+#include <type_traits>
 
 #include "basic_test_conversions.h"
-#include <limits.h>
-#include <string.h>
 
-#include "harness/mt19937.h"
-
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-#include "fplib.h"
-#endif
-
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-/* Rounding modes and saturation for use with qcom 64 bit to float conversion library */
-    bool            qcom_sat;
-    roundingMode    qcom_rm;
-#endif
-
-static inline cl_ulong random64( MTdata d );
-
-#if defined (_WIN32)
-    #include <mmintrin.h>
-    #include <emmintrin.h>
+#if defined(_WIN32)
+#include <mmintrin.h>
+#include <emmintrin.h>
 #else // !_WIN32
-#if defined (__SSE__ )
-    #include <xmmintrin.h>
+#if defined(__SSE__)
+#include <xmmintrin.h>
 #endif
-#if defined (__SSE2__ )
-    #include <emmintrin.h>
+#if defined(__SSE2__)
+#include <emmintrin.h>
 #endif
 #endif // _WIN32
 
-const char *gTypeNames[ kTypeCount ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
-
-const char *gRoundingModeNames[ kRoundingModeCount ] = {
-                                                            "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                                                        };
-
-const char *gSaturationNames[ 2 ] = { "", "_sat" };
-
-size_t gTypeSizes[ kTypeCount ] = {
-                                    sizeof( cl_uchar ), sizeof( cl_char ),
-                                    sizeof( cl_ushort ), sizeof( cl_short ),
-                                    sizeof( cl_uint ), sizeof( cl_int ),
-                                    sizeof( cl_float ), sizeof( cl_double ),
-                                    sizeof( cl_ulong ), sizeof( cl_long ),
-                                };
-
-long lrintf_clamped( float f );
-long lrintf_clamped( float f )
-{
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
-
-    if( f >= -(float) LONG_MIN )
-        return LONG_MAX;
-
-    if( f <= (float) LONG_MIN )
-        return LONG_MIN;
-
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23 ) )
-    {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-
-#if defined( __SSE__ ) || defined (_WIN32)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
+cl_context gContext = NULL;
+cl_command_queue gQueue = NULL;
+int gStartTestNumber = -1;
+int gEndTestNumber = 0;
+#if defined(__APPLE__)
+int gTimeResults = 1;
 #else
-        x += magicVal;
-        x -= magicVal;
+int gTimeResults = 0;
 #endif
-        f = x;
-    }
+int gReportAverageTimes = 0;
+void *gIn = NULL;
+void *gRef = NULL;
+void *gAllowZ = NULL;
+void *gOut[kCallStyleCount] = { NULL };
+cl_mem gInBuffer;
+cl_mem gOutBuffers[kCallStyleCount];
+size_t gComputeDevices = 0;
+uint32_t gDeviceFrequency = 0;
+int gWimpyMode = 0;
+int gWimpyReductionFactor = 128;
+int gSkipTesting = 0;
+int gForceFTZ = 0;
+int gIsRTZ = 0;
+uint32_t gSimdSize = 1;
+int gHasDouble = 0;
+int gTestDouble = 1;
+const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
+int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
+int gMinVectorSize = 0;
+int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
+MTdata gMTdata;
+const char **argList = NULL;
+int argCount = 0;
 
-    return (long) f;
-}
 
-long long llrintf_clamped( float f );
-long long llrintf_clamped( float f )
-{
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
+double SubtractTime(uint64_t endTime, uint64_t startTime);
 
-    if( f >= -(float) LLONG_MIN )
-        return LLONG_MAX;
 
-    if( f <= (float) LLONG_MIN )
-        return LLONG_MIN;
+// clang-format off
+// for readability sake keep this section unformatted
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23) )
-    {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-#if defined( __SSE__ ) || defined (_WIN32)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
-    }
-
-    return (long long) f;
-}
-
-long lrint_clamped( double f );
-long lrint_clamped( double f )
-{
-    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
-
-    if( sizeof( long ) > 4 )
-    {
-        if( f >= -(double) LONG_MIN )
-            return LONG_MAX;
-    }
-    else
-    {
-        if( f >= LONG_MAX )
-            return LONG_MAX;
-    }
-
-    if( f <= (double) LONG_MIN )
-        return LONG_MIN;
-
-    // Round fractional values to integer in round towards nearest mode
-    if( fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52) )
-    {
-        volatile double x = f;
-        double magicVal = magic[ f < 0 ];
-#if defined( __SSE2__ ) || defined (_MSC_VER)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128d v = _mm_set_sd( x );
-        __m128d m = _mm_set_sd( magicVal );
-        v = _mm_add_sd( v, m );
-        v = _mm_sub_sd( v, m );
-        _mm_store_sd( (double*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
-    }
-
-    return (long) f;
-}
-
-long long llrint_clamped( double f );
-long long llrint_clamped( double f )
-{
-    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
-
-    if( f >= -(double) LLONG_MIN )
-        return LLONG_MAX;
-
-    if( f <= (double) LLONG_MIN )
-        return LLONG_MIN;
-
-    // Round fractional values to integer in round towards nearest mode
-    if( fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52) )
-    {
-        volatile double x = f;
-        double magicVal = magic[ f < 0 ];
-#if defined( __SSE2__ ) || defined (_MSC_VER)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128d v = _mm_set_sd( x );
-        __m128d m = _mm_set_sd( magicVal );
-        v = _mm_add_sd( v, m );
-        v = _mm_sub_sd( v, m );
-        _mm_store_sd( (double*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
-    }
-
-    return (long long) f;
-}
-
-
-/*
-    Names created as:
-
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-
-    int main( void )
-    {
-
-        int i,j;
-
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    continue;
-
-                vlog( "void %s2%s( void *, void *);\n", names[i], names[j] );
-            }
-
-
-        return 0;
-    }
-*/
-
-static float my_fabsf( float x );
-static double my_fabs( double x );
-
-
-
-static void uchar2char( void *, void *);
-static void uchar2ushort( void *, void *);
-static void uchar2short( void *, void *);
-static void uchar2uint( void *, void *);
-static void uchar2int( void *, void *);
-static void uchar2float( void *, void *);
-static void uchar2double( void *, void *);
-static void uchar2ulong( void *, void *);
-static void uchar2long( void *, void *);
-static void char2uchar( void *, void *);
-static void char2ushort( void *, void *);
-static void char2short( void *, void *);
-static void char2uint( void *, void *);
-static void char2int( void *, void *);
-static void char2float( void *, void *);
-static void char2double( void *, void *);
-static void char2ulong( void *, void *);
-static void char2long( void *, void *);
-static void ushort2uchar( void *, void *);
-static void ushort2char( void *, void *);
-static void ushort2short( void *, void *);
-static void ushort2uint( void *, void *);
-static void ushort2int( void *, void *);
-static void ushort2float( void *, void *);
-static void ushort2double( void *, void *);
-static void ushort2ulong( void *, void *);
-static void ushort2long( void *, void *);
-static void short2uchar( void *, void *);
-static void short2char( void *, void *);
-static void short2ushort( void *, void *);
-static void short2uint( void *, void *);
-static void short2int( void *, void *);
-static void short2float( void *, void *);
-static void short2double( void *, void *);
-static void short2ulong( void *, void *);
-static void short2long( void *, void *);
-static void uint2uchar( void *, void *);
-static void uint2char( void *, void *);
-static void uint2ushort( void *, void *);
-static void uint2short( void *, void *);
-static void uint2int( void *, void *);
-static void uint2float( void *, void *);
-static void uint2double( void *, void *);
-static void uint2ulong( void *, void *);
-static void uint2long( void *, void *);
-static void int2uchar( void *, void *);
-static void int2char( void *, void *);
-static void int2ushort( void *, void *);
-static void int2short( void *, void *);
-static void int2uint( void *, void *);
-static void int2float( void *, void *);
-static void int2double( void *, void *);
-static void int2ulong( void *, void *);
-static void int2long( void *, void *);
-static void float2uchar( void *, void *);
-static void float2char( void *, void *);
-static void float2ushort( void *, void *);
-static void float2short( void *, void *);
-static void float2uint( void *, void *);
-static void float2int( void *, void *);
-static void float2double( void *, void *);
-static void float2ulong( void *, void *);
-static void float2long( void *, void *);
-static void double2uchar( void *, void *);
-static void double2char( void *, void *);
-static void double2ushort( void *, void *);
-static void double2short( void *, void *);
-static void double2uint( void *, void *);
-static void double2int( void *, void *);
-static void double2float( void *, void *);
-static void double2ulong( void *, void *);
-static void double2long( void *, void *);
-static void ulong2uchar( void *, void *);
-static void ulong2char( void *, void *);
-static void ulong2ushort( void *, void *);
-static void ulong2short( void *, void *);
-static void ulong2uint( void *, void *);
-static void ulong2int( void *, void *);
-static void ulong2float( void *, void *);
-static void ulong2double( void *, void *);
-static void ulong2long( void *, void *);
-static void long2uchar( void *, void *);
-static void long2char( void *, void *);
-static void long2ushort( void *, void *);
-static void long2short( void *, void *);
-static void long2uint( void *, void *);
-static void long2int( void *, void *);
-static void long2float( void *, void *);
-static void long2double( void *, void *);
-static void long2ulong( void *, void *);
-
-/*
-    Conversion list created as
-
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-
-    int main( void )
-    {
-
-        int i,j;
-
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-        {
-            vlog( "{ " );
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    vlog( "          NULL, " );
-                else
-                {
-                    char s[64];
-                    sprintf( s, "%s2%s,", names[j], names[i] );
-                    vlog( "%15s ", s );
-                }
-            }
-            vlog( "},\n" );
-        }
-
-        return 0;
-    }
-
- */
-/*
-Convert gConversions[kTypeCount][kTypeCount] = {
-{           NULL,     char2uchar,   ushort2uchar,    short2uchar,     uint2uchar,      int2uchar,    float2uchar,   double2uchar,    ulong2uchar,     long2uchar, },
-{     uchar2char,           NULL,    ushort2char,     short2char,      uint2char,       int2char,     float2char,    double2char,     ulong2char,      long2char, },
-{   uchar2ushort,    char2ushort,           NULL,   short2ushort,    uint2ushort,     int2ushort,   float2ushort,  double2ushort,   ulong2ushort,    long2ushort, },
-{    uchar2short,     char2short,   ushort2short,           NULL,     uint2short,      int2short,    float2short,   double2short,    ulong2short,     long2short, },
-{     uchar2uint,      char2uint,    ushort2uint,     short2uint,           NULL,       int2uint,     float2uint,    double2uint,     ulong2uint,      long2uint, },
-{      uchar2int,       char2int,     ushort2int,      short2int,       uint2int,           NULL,      float2int,     double2int,      ulong2int,       long2int, },
-{    uchar2float,     char2float,   ushort2float,    short2float,     uint2float,      int2float,           NULL,   double2float,    ulong2float,     long2float, },
-{   uchar2double,    char2double,  ushort2double,   short2double,    uint2double,     int2double,   float2double,           NULL,   ulong2double,    long2double, },
-{    uchar2ulong,     char2ulong,   ushort2ulong,    short2ulong,     uint2ulong,      int2ulong,    float2ulong,   double2ulong,           NULL,     long2ulong, },
-{     uchar2long,      char2long,    ushort2long,     short2long,      uint2long,       int2long,     float2long,    double2long,     ulong2long,           NULL, } };
-*/
-
-static void uchar2char_sat( void *, void *);
-static void uchar2ushort_sat( void *, void *);
-static void uchar2short_sat( void *, void *);
-static void uchar2uint_sat( void *, void *);
-static void uchar2int_sat( void *, void *);
-static void uchar2float_sat( void *, void *);
-static void uchar2double_sat( void *, void *);
-static void uchar2ulong_sat( void *, void *);
-static void uchar2long_sat( void *, void *);
-static void char2uchar_sat( void *, void *);
-static void char2ushort_sat( void *, void *);
-static void char2short_sat( void *, void *);
-static void char2uint_sat( void *, void *);
-static void char2int_sat( void *, void *);
-static void char2float_sat( void *, void *);
-static void char2double_sat( void *, void *);
-static void char2ulong_sat( void *, void *);
-static void char2long_sat( void *, void *);
-static void ushort2uchar_sat( void *, void *);
-static void ushort2char_sat( void *, void *);
-static void ushort2short_sat( void *, void *);
-static void ushort2uint_sat( void *, void *);
-static void ushort2int_sat( void *, void *);
-static void ushort2float_sat( void *, void *);
-static void ushort2double_sat( void *, void *);
-static void ushort2ulong_sat( void *, void *);
-static void ushort2long_sat( void *, void *);
-static void short2uchar_sat( void *, void *);
-static void short2char_sat( void *, void *);
-static void short2ushort_sat( void *, void *);
-static void short2uint_sat( void *, void *);
-static void short2int_sat( void *, void *);
-static void short2float_sat( void *, void *);
-static void short2double_sat( void *, void *);
-static void short2ulong_sat( void *, void *);
-static void short2long_sat( void *, void *);
-static void uint2uchar_sat( void *, void *);
-static void uint2char_sat( void *, void *);
-static void uint2ushort_sat( void *, void *);
-static void uint2short_sat( void *, void *);
-static void uint2int_sat( void *, void *);
-static void uint2float_sat( void *, void *);
-static void uint2double_sat( void *, void *);
-static void uint2ulong_sat( void *, void *);
-static void uint2long_sat( void *, void *);
-static void int2uchar_sat( void *, void *);
-static void int2char_sat( void *, void *);
-static void int2ushort_sat( void *, void *);
-static void int2short_sat( void *, void *);
-static void int2uint_sat( void *, void *);
-static void int2float_sat( void *, void *);
-static void int2double_sat( void *, void *);
-static void int2ulong_sat( void *, void *);
-static void int2long_sat( void *, void *);
-static void float2uchar_sat( void *, void *);
-static void float2char_sat( void *, void *);
-static void float2ushort_sat( void *, void *);
-static void float2short_sat( void *, void *);
-static void float2uint_sat( void *, void *);
-static void float2int_sat( void *, void *);
-static void float2double_sat( void *, void *);
-static void float2ulong_sat( void *, void *);
-static void float2long_sat( void *, void *);
-static void double2uchar_sat( void *, void *);
-static void double2char_sat( void *, void *);
-static void double2ushort_sat( void *, void *);
-static void double2short_sat( void *, void *);
-static void double2uint_sat( void *, void *);
-static void double2int_sat( void *, void *);
-static void double2float_sat( void *, void *);
-static void double2ulong_sat( void *, void *);
-static void double2long_sat( void *, void *);
-static void ulong2uchar_sat( void *, void *);
-static void ulong2char_sat( void *, void *);
-static void ulong2ushort_sat( void *, void *);
-static void ulong2short_sat( void *, void *);
-static void ulong2uint_sat( void *, void *);
-static void ulong2int_sat( void *, void *);
-static void ulong2float_sat( void *, void *);
-static void ulong2double_sat( void *, void *);
-static void ulong2long_sat( void *, void *);
-static void long2uchar_sat( void *, void *);
-static void long2char_sat( void *, void *);
-static void long2ushort_sat( void *, void *);
-static void long2short_sat( void *, void *);
-static void long2uint_sat( void *, void *);
-static void long2int_sat( void *, void *);
-static void long2float_sat( void *, void *);
-static void long2double_sat( void *, void *);
-static void long2ulong_sat( void *, void *);
-/*
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-
-    int main( void )
-    {
-
-        int i,j;
-
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-        {
-            vlog( "{ " );
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    vlog( "             NULL, " );
-                else
-                {
-                    char s[64];
-                    sprintf( s, "%s2%s_sat,", names[j], names[i] );
-                    vlog( "%18s ", s );
-                }
-            }
-            vlog( "},\n" );
-        }
-
-        return 0;
-    }
-
-Convert gSaturatedConversions[kTypeCount][kTypeCount] = {
-{              NULL,    char2uchar_sat,  ushort2uchar_sat,   short2uchar_sat,    uint2uchar_sat,     int2uchar_sat,   float2uchar_sat,  double2uchar_sat,   ulong2uchar_sat,    long2uchar_sat, },
-{    uchar2char_sat,              NULL,   ushort2char_sat,    short2char_sat,     uint2char_sat,      int2char_sat,    float2char_sat,   double2char_sat,    ulong2char_sat,     long2char_sat, },
-{  uchar2ushort_sat,   char2ushort_sat,              NULL,  short2ushort_sat,   uint2ushort_sat,    int2ushort_sat,  float2ushort_sat, double2ushort_sat,  ulong2ushort_sat,   long2ushort_sat, },
-{   uchar2short_sat,    char2short_sat,  ushort2short_sat,              NULL,    uint2short_sat,     int2short_sat,   float2short_sat,  double2short_sat,   ulong2short_sat,    long2short_sat, },
-{    uchar2uint_sat,     char2uint_sat,   ushort2uint_sat,    short2uint_sat,              NULL,      int2uint_sat,    float2uint_sat,   double2uint_sat,    ulong2uint_sat,     long2uint_sat, },
-{     uchar2int_sat,      char2int_sat,    ushort2int_sat,     short2int_sat,      uint2int_sat,              NULL,     float2int_sat,    double2int_sat,     ulong2int_sat,      long2int_sat, },
-{   uchar2float_sat,    char2float_sat,  ushort2float_sat,   short2float_sat,    uint2float_sat,     int2float_sat,              NULL,  double2float_sat,   ulong2float_sat,    long2float_sat, },
-{  uchar2double_sat,   char2double_sat, ushort2double_sat,  short2double_sat,   uint2double_sat,    int2double_sat,  float2double_sat,              NULL,  ulong2double_sat,   long2double_sat, },
-{   uchar2ulong_sat,    char2ulong_sat,  ushort2ulong_sat,   short2ulong_sat,    uint2ulong_sat,     int2ulong_sat,   float2ulong_sat,  double2ulong_sat,              NULL,    long2ulong_sat, },
-{    uchar2long_sat,     char2long_sat,   ushort2long_sat,    short2long_sat,     uint2long_sat,      int2long_sat,    float2long_sat,   double2long_sat,    ulong2long_sat,              NULL, }
-};
-*/
-
-/*
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-    const char *types[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "llong" };
-
-    int main( void )
-    {
-
-        int i,j;
-
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    continue;
-
-                switch( i )
-                {
-                    case 6: //float
-                        if( j == 7 )
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
-                        else
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) my_rintf(((%s*) in)[0]); }\n", names[i], names[i], names[j], types[j], types[i] );
-                        break;
-                    case 7: //double
-                        if( j == 6 )
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
-                        else
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) rint(((%s*) in)[0]); }\n", names[i], names[i], names[j], types[j], types[i] );
-                        break;
-                    default:
-                        vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s)
-                        ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
-                        break;
-                }
-            }
-
-
-        return 0;
-    }
-*/
-
-float my_fabsf( float x )
-{
-    union{ cl_uint u; float f; }u;
-    u.f = x;
-    u.u &= 0x7fffffff;
-    return u.f;
-}
-
-double my_fabs( double x )
-{
-    union{ cl_ulong u; double f; }u;
-    u.f = x;
-    u.u &= 0x7fffffffffffffffULL;
-    return u.f;
-}
-
-static float my_rintf( float f );
-static float my_rintf( float f )
-{
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
-
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23 ) )
-    {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-
-#if defined( __SSE__ )
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
-    }
-
-    return f;
-}
-
-static void uchar2char( void *out, void *in){ ((char*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2short( void *out, void *in){ ((short*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2int( void *out, void *in){ ((int*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2float( void *out, void *in)
-{
-    cl_uchar l = ((cl_uchar*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uchar2double( void *out, void *in)
-{
-    cl_uchar l = ((cl_uchar*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uchar2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uchar*) in)[0]; }
-static void char2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_char*) in)[0]; }
-static void char2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_char*) in)[0]; }
-static void char2short( void *out, void *in){ ((short*) out)[0] = ((cl_char*) in)[0]; }
-static void char2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_char*) in)[0]; }
-static void char2int( void *out, void *in){ ((int*) out)[0] = ((cl_char*) in)[0]; }
-static void char2float( void *out, void *in)
-{
-    cl_char l = ((cl_char*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void char2double( void *out, void *in)
-{
-    cl_char l = ((cl_char*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void char2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_char*) in)[0]; }
-static void char2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_char*) in)[0]; }
-static void ushort2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2char( void *out, void *in){ ((char*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2short( void *out, void *in){ ((short*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2int( void *out, void *in){ ((int*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2float( void *out, void *in)
-{
-    cl_ushort l = ((cl_ushort*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void ushort2double( void *out, void *in)
-{
-    cl_ushort l = ((cl_ushort*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void ushort2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ushort*) in)[0]; }
-static void short2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_short*) in)[0]; }
-static void short2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_short*) in)[0]; }
-static void short2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_short*) in)[0]; }
-static void short2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_short*) in)[0]; }
-static void short2int( void *out, void *in){ ((cl_int*) out)[0] = ((cl_short*) in)[0]; }
-static void short2float( void *out, void *in)
-{
-    cl_short l = ((cl_short*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void short2double( void *out, void *in)
-{
-    cl_short l = ((cl_short*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void short2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_short*) in)[0]; }
-static void short2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_short*) in)[0]; }
-static void uint2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2short( void *out, void *in){ ((short*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2int( void *out, void *in){ ((cl_int*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2float( void *out, void *in)
-{
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_uint l = ((cl_uint *)in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uint2double( void *out, void *in)
-{
-    cl_uint l = ((cl_uint*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uint2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uint*) in)[0]; }
-static void int2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_int*) in)[0]; }
-static void int2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_int*) in)[0]; }
-static void int2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_int*) in)[0]; }
-static void int2short( void *out, void *in){ ((cl_short*) out)[0] = ((cl_int*) in)[0]; }
-static void int2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_int*) in)[0]; }
-static void int2float( void *out, void *in)
-{
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_int l = ((cl_int *)in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void int2double( void *out, void *in)
-{
-    cl_int l = ((cl_int*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void int2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_int*) in)[0]; }
-static void int2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_int*) in)[0]; }
-static void float2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2char( void *out, void *in){ ((cl_char*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2short( void *out, void *in){ ((cl_short*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2uint( void *out, void *in){ ((cl_uint*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2int( void *out, void *in){ ((cl_int*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2double( void *out, void *in){ ((cl_double*) out)[0] = ((cl_float*) in)[0]; }
-static void float2ulong( void *out, void *in)
-{
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-    // VS2005 (at least) on x86 uses fistp to store the float as a 64-bit int.
-    // However, fistp stores it as a signed int, and some of the test values won't
-    // fit into a signed int. (These test values are >= 2^63.) The result on VS2005
-    // is that these end up silently (at least by default settings) clamped to
-    // the max lowest ulong.
-    cl_float x = my_rintf(((cl_float *)in)[0]);
-    if (x >= 9223372036854775808.0f) {
-        x -= 9223372036854775808.0f;
-        ((cl_ulong*) out)[0] = x;
-        ((cl_ulong*) out)[0] += 9223372036854775808ULL;
-    } else {
-        ((cl_ulong*) out)[0] = x;
-    }
-#else
-    ((cl_ulong*) out)[0] = my_rintf(((cl_float*) in)[0]);
-#endif
-}
-
-static void float2long( void *out, void *in){ ((cl_long*) out)[0] =  llrint_clamped( ((cl_float*) in)[0] ); }
-static void double2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2char( void *out, void *in){ ((cl_char*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2short( void *out, void *in){ ((cl_short*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) rint(((cl_double*) in)[0]); }
-static void double2int( void *out, void *in){ ((cl_int*) out)[0] = (int) rint(((cl_double*) in)[0]); }
-static void double2float( void *out, void *in){ ((cl_float*) out)[0] = (float) ((cl_double*) in)[0]; }
-static void double2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = (cl_ulong) rint(((cl_double*) in)[0]); }
-static void double2long( void *out, void *in){ ((cl_long*) out)[0] = (cl_long) rint(((cl_double*) in)[0]); }
-static void ulong2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = (cl_uchar) ((cl_ulong*) in)[0]; }
-static void ulong2char( void *out, void *in){ ((cl_char*) out)[0] = (cl_char) ((cl_ulong*) in)[0]; }
-static void ulong2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = (cl_ushort) ((cl_ulong*) in)[0]; }
-static void ulong2short( void *out, void *in){ ((cl_short*) out)[0] = (cl_short)((cl_ulong*) in)[0]; }
-static void ulong2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) ((cl_ulong*) in)[0]; }
-static void ulong2int( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) ((cl_ulong*) in)[0]; }
-static void ulong2float( void *out, void *in)
-{
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_ulong l = ((cl_ulong*) in)[0];
-    float result;
-    cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1)) : (cl_long)l;
-    _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
-    ((float*) out)[0] = (l == 0 ? 0.0f : (((cl_long)l < 0) ? result * 2.0f : result));
-#else
-    cl_ulong l = ((cl_ulong*) in)[0];
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-    /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-     * integer to float types, hence GCC ARM uses the floating-point emulation
-     * code despite which -mfloat-abi setting it is. But the emulation code in
-     * libgcc.a has only one rounding mode (round to nearest even in this case)
-     * and ignores the user rounding mode setting in hardware.
-     * As a result setting rounding modes in hardware won't give correct
-     * rounding results for type covert from 64-bit integer to float using GCC
-     * for ARM compiler so for testing different rounding modes, we need to use
-     * alternative reference function. ARM64 does have an instruction, however
-     * we cannot guarantee the compiler will use it.  On all ARM architechures
-     * use emulation to calculate reference.*/
-    ((float*) out)[0] = qcom_u64_2_f32(l, qcom_sat, qcom_rm);
-#else
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-#endif
-}
-static void ulong2double( void *out, void *in)
-{
-#if defined(_MSC_VER)
-    cl_ulong l = ((cl_ulong*) in)[0];
-    double result;
-
-    cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1)) : (cl_long)l;
-#if defined(_M_X64)
-    _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), sl));
-#else
-    result = sl;
-#endif
-    ((double*) out)[0] = (l == 0 ? 0.0 : (((cl_long)l < 0) ? result * 2.0 : result));
-#else
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_ulong l = ((cl_ulong *)in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-}
-static void ulong2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ulong*) in)[0]; }
-static void long2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = (cl_uchar) ((cl_long*) in)[0]; }
-static void long2char( void *out, void *in){ ((cl_char*) out)[0] = (cl_char) ((cl_long*) in)[0]; }
-static void long2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = (cl_ushort) ((cl_long*) in)[0]; }
-static void long2short( void *out, void *in){ ((cl_short*) out)[0] = (cl_short) ((cl_long*) in)[0]; }
-static void long2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) ((cl_long*) in)[0]; }
-static void long2int( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) ((cl_long*) in)[0]; }
-static void long2float( void *out, void *in)
-{
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_long l = ((cl_long*) in)[0];
-    float result;
-
-    _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), l));
-    ((float*) out)[0] = (l == 0 ? 0.0f : result);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#else
-    cl_long l = ((cl_long*) in)[0];
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-    /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-     * integer to float types, hence GCC ARM uses the floating-point emulation
-     * code despite which -mfloat-abi setting it is. But the emulation code in
-     * libgcc.a has only one rounding mode (round to nearest even in this case)
-     * and ignores the user rounding mode setting in hardware.
-     * As a result setting rounding modes in hardware won't give correct
-     * rounding results for type covert from 64-bit integer to float using GCC
-     * for ARM compiler so for testing different rounding modes, we need to use
-     * alternative reference function. ARM64 does have an instruction, however
-     * we cannot guarantee the compiler will use it.  On all ARM architechures
-     * use emulation to calculate reference.*/
-    ((float*) out)[0] = (l == 0 ? 0.0f : qcom_s64_2_f32(l, qcom_sat, qcom_rm));
-#else
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-#endif
-}
-static void long2double( void *out, void *in)
-{
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_long l = ((cl_long*) in)[0];
-    double result;
-
-    _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), l));
-    ((double*) out)[0] = (l == 0 ? 0.0 : result);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#else
-    cl_long l = ((cl_long*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-}
-static void long2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_long*) in)[0]; }
-
-#define CLAMP( _lo, _x, _hi )   ( (_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
-
-// Done by hand
-static void uchar2char_sat( void *out, void *in){ cl_uchar c = ((cl_uchar*) in)[0]; ((cl_char*) out)[0] = c > 0x7f ? 0x7f : c; }
-static void uchar2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2short_sat( void *out, void *in){ ((cl_short*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf( (cl_float) ((cl_uchar*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uchar2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_uchar*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uchar2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uchar*) in)[0]; }
-static void char2uchar_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_uchar*) out)[0] = c < 0 ? 0 : c; }
-static void char2ushort_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_ushort*) out)[0] = c < 0 ? 0 : c; }
-static void char2short_sat( void *out, void *in){ ((cl_short*) out)[0] = ((cl_char*) in)[0]; }
-static void char2uint_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_uint*) out)[0] = c < 0 ? 0 : c; }
-static void char2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_char*) in)[0]; }
-static void char2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_char*) in)[0]; }
-static void char2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_char*) in)[0]; }
-static void char2ulong_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_ulong*) out)[0] = c < 0 ? 0 : c; }
-static void char2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_char*) in)[0]; }
-static void ushort2uchar_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_uchar*) out)[0] = u > 0xff ? 0xFF : u; }
-static void ushort2char_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_char*) out)[0] = u > 0x7f ? 0x7F : u; }
-static void ushort2short_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_short*) out)[0] = u > 0x7fff ? 0x7fFF : u; }
-static void ushort2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf((cl_float)((cl_ushort*) in)[0]); }     // my_fabs workaround for <rdar://problem/5965527>
-static void ushort2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_ushort*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void ushort2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ushort*) in)[0]; }
-static void short2uchar_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, s, CL_UCHAR_MAX ); }
-static void short2char_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, s, CL_CHAR_MAX ); }
-static void short2ushort_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_ushort*) out)[0] = s < 0 ? 0 : s; }
-static void short2uint_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_uint*) out)[0] = s < 0 ? 0 : s; }
-static void short2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_short*) in)[0]; }
-static void short2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_short*) in)[0]; }
-static void short2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_short*) in)[0]; }
-static void short2ulong_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_ulong*) out)[0] = s < 0 ? 0 : s; }
-static void short2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_short*) in)[0]; }
-static void uint2uchar_sat( void *out, void *in){ cl_uint u = ((cl_uint*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX); }
-static void uint2char_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_char*) out)[0] = CLAMP( 0, u, CL_CHAR_MAX ); }
-static void uint2ushort_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX); }
-static void uint2short_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_short*) out)[0] = CLAMP( 0, u, CL_SHRT_MAX); }
-static void uint2int_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_int*) out)[0] = CLAMP( 0, u, CL_INT_MAX); }
-static void uint2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf( (cl_float) ((cl_uint*) in)[0] ); }  // my_fabs workaround for <rdar://problem/5965527>
-static void uint2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_uint*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uint2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uint*) in)[0]; }
-static void int2uchar_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, i, CL_UCHAR_MAX); }
-static void int2char_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, i, CL_CHAR_MAX); }
-static void int2ushort_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, i, CL_USHRT_MAX); }
-static void int2short_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, i, CL_SHRT_MAX); }
-static void int2uint_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_uint*) out)[0] = CLAMP( 0, i, CL_INT_MAX); }
-static void int2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_int*) in)[0]; }
-static void int2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_int*) in)[0]; }
-static void int2ulong_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_ulong*) out)[0] = i < 0 ? 0 : i; }
-static void int2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_int*) in)[0]; }
-static void float2uchar_sat( void *out, void *in){ ((cl_uchar*) out)[0] = CLAMP( 0, lrintf_clamped(((cl_float*) in)[0]), CL_UCHAR_MAX ); }
-static void float2char_sat( void *out, void *in){ ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_CHAR_MAX); }
-static void float2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = CLAMP( 0, lrintf_clamped(((cl_float*) in)[0]), CL_USHRT_MAX ); }
-static void float2short_sat( void *out, void *in){ ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_SHRT_MAX ); }
-static void float2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, llrintf_clamped(((cl_float*) in)[0]), CL_UINT_MAX ); }
-static void float2int_sat( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) CLAMP( CL_INT_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_INT_MAX ); }
-static void float2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_float*) in)[0]; }
-static void float2ulong_sat( void *out, void *in)
-{
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-    // VS2005 (at least) on x86 uses fistp to store the float as a 64-bit int.
-    // However, fistp stores it as a signed int, and some of the test values won't
-    // fit into a signed int. (These test values are >= 2^63.) The result on VS2005
-    // is that these end up silently (at least by default settings) clamped to
-    // the max lowest ulong.
-    cl_float x = my_rintf(((cl_float *)in)[0]);
-    if (x >= 18446744073709551616.0f) {         // 2^64
-        ((cl_ulong*) out)[0] = 0xFFFFFFFFFFFFFFFFULL;
-    } else if (x < 0) {
-        ((cl_ulong*) out)[0] = 0;
-    } else if (x >= 9223372036854775808.0f) {   // 2^63
-        x -= 9223372036854775808.0f;
-        ((cl_ulong*) out)[0] = x;
-        ((cl_ulong*) out)[0] += 9223372036854775808ULL;
-    } else {
-        ((cl_ulong*) out)[0] = x;
-    }
-#else
-    float f = my_rintf(((float*) in)[0]); ((cl_ulong*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) ? 0xFFFFFFFFFFFFFFFFULL : f < 0 ? 0 : (cl_ulong) f;
-#endif
-}
-// The final cast used to be (cl_ulong) f, but on Linux (RHEL5 at least)
-// if f = -1.0f, then (cl_ulong) f = 0xffffffff, which clearly isn't right.
-// Switching it to (cl_long) f seems to fix that.
-static void float2long_sat( void *out, void *in){ float f = my_rintf(((float*) in)[0]); ((cl_long*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) ? 0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) ? 0x8000000000000000LL : (cl_long) f; }
-static void double2uchar_sat( void *out, void *in){ ((cl_uchar*) out)[0] = CLAMP( 0, lrint_clamped(((cl_double*) in)[0]), CL_UCHAR_MAX ); }
-static void double2char_sat( void *out, void *in){ ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, lrint_clamped(((cl_double*) in)[0]), CL_CHAR_MAX); }
-static void double2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = CLAMP( 0, lrint_clamped(((cl_double*) in)[0]), CL_USHRT_MAX ); }
-static void double2short_sat( void *out, void *in){ ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, lrint_clamped(((cl_double*) in)[0]), CL_SHRT_MAX ); }
-static void double2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, llrint_clamped(((cl_double*) in)[0]), CL_UINT_MAX ); }
-static void double2int_sat( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) CLAMP( CL_INT_MIN, lrint_clamped(((cl_double*) in)[0]), CL_INT_MAX ); }
-static void double2float_sat( void *out, void *in){ ((cl_float*) out)[0] = (cl_float) ((double*) in)[0]; }
-static void double2ulong_sat( void *out, void *in){ double f = rint(((double*) in)[0]); ((cl_ulong*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) ? 0xFFFFFFFFFFFFFFFFULL : f < 0 ? 0 : (cl_ulong) f; }
-static void double2long_sat( void *out, void *in){ double f = rint(((double*) in)[0]); ((cl_long*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) ? 0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) ? 0x8000000000000000LL : (cl_long) f; }
-static void ulong2uchar_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX ); }
-static void ulong2char_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_char*) out)[0] = CLAMP( 0, u, CL_CHAR_MAX ); }
-static void ulong2ushort_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX ); }
-static void ulong2short_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_short*) out)[0] = CLAMP( 0, u, CL_SHRT_MAX ); }
-static void ulong2uint_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, u, CL_UINT_MAX ); }
-static void ulong2int_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_int*) out)[0] = (cl_int) CLAMP( 0, u, CL_INT_MAX ); }
-static void ulong2float_sat( void *out, void *in){ ((float*) out)[0] = my_fabsf((float) ((cl_ulong*) in)[0]); }  // my_fabs workaround for <rdar://problem/5965527>
-static void ulong2double_sat( void *out, void *in){ ((double*) out)[0] = my_fabs( ((cl_ulong*) in)[0]); }        // my_fabs workaround for <rdar://problem/5965527>
-static void ulong2long_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_long*) out)[0] = CLAMP( 0, u, CL_LONG_MAX ); }
-static void long2uchar_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX ); }
-static void long2char_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, u, CL_CHAR_MAX ); }
-static void long2ushort_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX ); }
-static void long2short_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, u, CL_SHRT_MAX ); }
-static void long2uint_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, u, CL_UINT_MAX ); }
-static void long2int_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_int*) out)[0] = (int) CLAMP( CL_INT_MIN, u, CL_INT_MAX ); }
-static void long2float_sat( void *out, void *in){ ((float*) out)[0] = (float) ((cl_long*) in)[0]; }
-static void long2double_sat( void *out, void *in){ ((double*) out)[0] = ((cl_long*) in)[0]; }
-static void long2ulong_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_ulong*) out)[0] = CLAMP( 0, u, CL_LONG_MAX ); }
-
-/*
-#include <stdio.h>
-
-char *ground[] = {   "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                    };
-
-const char *gTypeNames[  ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
-
-
-int main( void )
-{
-    int i, j;
-
-    for( i = 0; i < sizeof( gTypeNames ) / sizeof( gTypeNames[0] ); i++ )
-        for( j = 0; j < sizeof( ground ) / sizeof( ground[0] ); j++ )
-        {
-            vlog( "float clampf_%s%s( float );\n", gTypeNames[i], ground[j] );
-            vlog( "double clampd_%s%s( double );\n", gTypeNames[i], ground[j] );
-        }
-
-    return 0;
-
-}
-*/
-
-
-float clampf_uchar( float );
-double clampd_uchar( double );
-float clampf_uchar_rte( float );
-double clampd_uchar_rte( double );
-float clampf_uchar_rtp( float );
-double clampd_uchar_rtp( double );
-float clampf_uchar_rtn( float );
-double clampd_uchar_rtn( double );
-float clampf_uchar_rtz( float );
-double clampd_uchar_rtz( double );
-float clampf_char( float );
-double clampd_char( double );
-float clampf_char_rte( float );
-double clampd_char_rte( double );
-float clampf_char_rtp( float );
-double clampd_char_rtp( double );
-float clampf_char_rtn( float );
-double clampd_char_rtn( double );
-float clampf_char_rtz( float );
-double clampd_char_rtz( double );
-float clampf_ushort( float );
-double clampd_ushort( double );
-float clampf_ushort_rte( float );
-double clampd_ushort_rte( double );
-float clampf_ushort_rtp( float );
-double clampd_ushort_rtp( double );
-float clampf_ushort_rtn( float );
-double clampd_ushort_rtn( double );
-float clampf_ushort_rtz( float );
-double clampd_ushort_rtz( double );
-float clampf_short( float );
-double clampd_short( double );
-float clampf_short_rte( float );
-double clampd_short_rte( double );
-float clampf_short_rtp( float );
-double clampd_short_rtp( double );
-float clampf_short_rtn( float );
-double clampd_short_rtn( double );
-float clampf_short_rtz( float );
-double clampd_short_rtz( double );
-float clampf_uint( float );
-double clampd_uint( double );
-float clampf_uint_rte( float );
-double clampd_uint_rte( double );
-float clampf_uint_rtp( float );
-double clampd_uint_rtp( double );
-float clampf_uint_rtn( float );
-double clampd_uint_rtn( double );
-float clampf_uint_rtz( float );
-double clampd_uint_rtz( double );
-float clampf_int( float );
-double clampd_int( double );
-float clampf_int_rte( float );
-double clampd_int_rte( double );
-float clampf_int_rtp( float );
-double clampd_int_rtp( double );
-float clampf_int_rtn( float );
-double clampd_int_rtn( double );
-float clampf_int_rtz( float );
-double clampd_int_rtz( double );
-float clampf_float( float );
-double clampd_float( double );
-float clampf_float_rte( float );
-double clampd_float_rte( double );
-float clampf_float_rtp( float );
-double clampd_float_rtp( double );
-float clampf_float_rtn( float );
-double clampd_float_rtn( double );
-float clampf_float_rtz( float );
-double clampd_float_rtz( double );
-float clampf_double( float );
-double clampd_double( double );
-float clampf_double_rte( float );
-double clampd_double_rte( double );
-float clampf_double_rtp( float );
-double clampd_double_rtp( double );
-float clampf_double_rtn( float );
-double clampd_double_rtn( double );
-float clampf_double_rtz( float );
-double clampd_double_rtz( double );
-float clampf_ulong( float );
-double clampd_ulong( double );
-float clampf_ulong_rte( float );
-double clampd_ulong_rte( double );
-float clampf_ulong_rtp( float );
-double clampd_ulong_rtp( double );
-float clampf_ulong_rtn( float );
-double clampd_ulong_rtn( double );
-float clampf_ulong_rtz( float );
-double clampd_ulong_rtz( double );
-float clampf_long( float );
-double clampd_long( double );
-float clampf_long_rte( float );
-double clampd_long_rte( double );
-float clampf_long_rtp( float );
-double clampd_long_rtp( double );
-float clampf_long_rtn( float );
-double clampd_long_rtn( double );
-float clampf_long_rtz( float );
-double clampd_long_rtz( double );
-
-/*
-#include <stdio.h>
-
-char *ground[] = {   "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                    };
-
-const char *gTypeNames[  ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
-
-
-int main( void )
-{
-    int i, j;
-
-    for( i = 0; i < sizeof( gTypeNames ) / sizeof( gTypeNames[0] ); i++ )
-    {
-        vlog( "{\t" );
-        for( j = 0; j < sizeof( ground ) / sizeof( ground[0] ); j++ )
-            vlog( "clampf_%s%s,\t", gTypeNames[i], ground[j] );
-
-        vlog( "\t},\n" );
-    }
-
-    return 0;
-
-}
-*/
-clampf gClampFloat[ kTypeCount ][kRoundingModeCount] = {
-    {    clampf_uchar,    clampf_uchar_rte,    clampf_uchar_rtp,    clampf_uchar_rtn,    clampf_uchar_rtz,        },
-    {    clampf_char,    clampf_char_rte,    clampf_char_rtp,    clampf_char_rtn,    clampf_char_rtz,        },
-    {    clampf_ushort,    clampf_ushort_rte,    clampf_ushort_rtp,    clampf_ushort_rtn,    clampf_ushort_rtz,        },
-    {    clampf_short,    clampf_short_rte,    clampf_short_rtp,    clampf_short_rtn,    clampf_short_rtz,        },
-    {    clampf_uint,    clampf_uint_rte,    clampf_uint_rtp,    clampf_uint_rtn,    clampf_uint_rtz,        },
-    {    clampf_int,     clampf_int_rte,     clampf_int_rtp,     clampf_int_rtn,     clampf_int_rtz,         },
-    {    clampf_float,    clampf_float_rte,    clampf_float_rtp,    clampf_float_rtn,    clampf_float_rtz,        },
-    {    clampf_double,    clampf_double_rte,    clampf_double_rtp,    clampf_double_rtn,    clampf_double_rtz,        },
-    {    clampf_ulong,    clampf_ulong_rte,    clampf_ulong_rtp,    clampf_ulong_rtn,    clampf_ulong_rtz,        },
-    {    clampf_long,    clampf_long_rte,    clampf_long_rtp,    clampf_long_rtn,    clampf_long_rtz,        }
+std::vector<unsigned int> DataInitInfo::specialValuesUInt = {
+      uint32_t(INT_MIN), uint32_t(INT_MIN + 1), uint32_t(INT_MIN + 2),
+      uint32_t(-(1 << 30) - 3), uint32_t(-(1 << 30) - 2), uint32_t(-(1 << 30) - 1), uint32_t(-(1 << 30)),
+      uint32_t(-(1 << 30) + 1), uint32_t(-(1 << 30) + 2), uint32_t(-(1 << 30) + 3),
+      uint32_t(-(1 << 24) - 3), uint32_t(-(1 << 24) - 2),uint32_t(-(1 << 24) - 1),
+      uint32_t(-(1 << 24)), uint32_t(-(1 << 24) + 1), uint32_t(-(1 << 24) + 2), uint32_t(-(1 << 24) + 3),
+      uint32_t(-(1 << 23) - 3), uint32_t(-(1 << 23) - 2),uint32_t(-(1 << 23) - 1),
+      uint32_t(-(1 << 23)), uint32_t(-(1 << 23) + 1), uint32_t(-(1 << 23) + 2), uint32_t(-(1 << 23) + 3),
+      uint32_t(-(1 << 22) - 3), uint32_t(-(1 << 22) - 2),uint32_t(-(1 << 22) - 1),
+      uint32_t(-(1 << 22)), uint32_t(-(1 << 22) + 1), uint32_t(-(1 << 22) + 2), uint32_t(-(1 << 22) + 3),
+      uint32_t(-(1 << 21) - 3), uint32_t(-(1 << 21) - 2),uint32_t(-(1 << 21) - 1),
+      uint32_t(-(1 << 21)), uint32_t(-(1 << 21) + 1), uint32_t(-(1 << 21) + 2), uint32_t(-(1 << 21) + 3),
+      uint32_t(-(1 << 16) - 3), uint32_t(-(1 << 16) - 2),uint32_t(-(1 << 16) - 1),
+      uint32_t(-(1 << 16)), uint32_t(-(1 << 16) + 1), uint32_t(-(1 << 16) + 2), uint32_t(-(1 << 16) + 3),
+      uint32_t(-(1 << 15) - 3), uint32_t(-(1 << 15) - 2),uint32_t(-(1 << 15) - 1),
+      uint32_t(-(1 << 15)), uint32_t(-(1 << 15) + 1), uint32_t(-(1 << 15) + 2), uint32_t(-(1 << 15) + 3),
+      uint32_t(-(1 << 8) - 3), uint32_t(-(1 << 8) - 2),uint32_t(-(1 << 8) - 1),
+      uint32_t(-(1 << 8)), uint32_t(-(1 << 8) + 1), uint32_t(-(1 << 8) + 2), uint32_t(-(1 << 8) + 3),
+      uint32_t(-(1 << 7) - 3), uint32_t(-(1 << 7) - 2),uint32_t(-(1 << 7) - 1),
+      uint32_t(-(1 << 7)), uint32_t(-(1 << 7) + 1), uint32_t(-(1 << 7) + 2), uint32_t(-(1 << 7) + 3),
+      uint32_t(-4), uint32_t(-3), uint32_t(-2), uint32_t(-1), 0, 1, 2, 3, 4,
+      (1 << 7) - 3,(1 << 7) - 2,(1 << 7) - 1, (1 << 7), (1 << 7) + 1, (1 << 7) + 2, (1 << 7) + 3,
+      (1 << 8) - 3,(1 << 8) - 2,(1 << 8) - 1, (1 << 8), (1 << 8) + 1, (1 << 8) + 2, (1 << 8) + 3,
+      (1 << 15) - 3,(1 << 15) - 2,(1 << 15) - 1, (1 << 15), (1 << 15) + 1, (1 << 15) + 2, (1 << 15) + 3,
+      (1 << 16) - 3,(1 << 16) - 2,(1 << 16) - 1, (1 << 16), (1 << 16) + 1, (1 << 16) + 2, (1 << 16) + 3,
+      (1 << 21) - 3,(1 << 21) - 2,(1 << 21) - 1, (1 << 21), (1 << 21) + 1, (1 << 21) + 2, (1 << 21) + 3,
+      (1 << 22) - 3,(1 << 22) - 2,(1 << 22) - 1, (1 << 22), (1 << 22) + 1, (1 << 22) + 2, (1 << 22) + 3,
+      (1 << 23) - 3,(1 << 23) - 2,(1 << 23) - 1, (1 << 23), (1 << 23) + 1, (1 << 23) + 2, (1 << 23) + 3,
+      (1 << 24) - 3,(1 << 24) - 2,(1 << 24) - 1, (1 << 24), (1 << 24) + 1, (1 << 24) + 2, (1 << 24) + 3,
+      (1 << 30) - 3,(1 << 30) - 2,(1 << 30) - 1, (1 << 30), (1 << 30) + 1, (1 << 30) + 2, (1 << 30) + 3,
+      INT_MAX - 3, INT_MAX - 2, INT_MAX - 1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
+      UINT_MAX - 3, UINT_MAX - 2, UINT_MAX - 1, UINT_MAX
 };
 
-clampd gClampDouble[ kTypeCount ][kRoundingModeCount] = {
-    {    clampd_uchar,    clampd_uchar_rte,    clampd_uchar_rtp,    clampd_uchar_rtn,    clampd_uchar_rtz,        },
-    {    clampd_char,    clampd_char_rte,    clampd_char_rtp,    clampd_char_rtn,    clampd_char_rtz,        },
-    {    clampd_ushort,    clampd_ushort_rte,    clampd_ushort_rtp,    clampd_ushort_rtn,    clampd_ushort_rtz,        },
-    {    clampd_short,    clampd_short_rte,    clampd_short_rtp,    clampd_short_rtn,    clampd_short_rtz,        },
-    {    clampd_uint,    clampd_uint_rte,    clampd_uint_rtp,    clampd_uint_rtn,    clampd_uint_rtz,        },
-    {    clampd_int,     clampd_int_rte,     clampd_int_rtp,     clampd_int_rtn,     clampd_int_rtz,         },
-    {    clampd_float,    clampd_float_rte,    clampd_float_rtp,    clampd_float_rtn,    clampd_float_rtz,        },
-    {    clampd_double,    clampd_double_rte,    clampd_double_rtp,    clampd_double_rtn,    clampd_double_rtz,        },
-    {    clampd_ulong,    clampd_ulong_rte,    clampd_ulong_rtp,    clampd_ulong_rtn,    clampd_ulong_rtz,        },
-    {    clampd_long,    clampd_long_rte,    clampd_long_rtp,    clampd_long_rtn,    clampd_long_rtz,        }
+std::vector<float> DataInitInfo::specialValuesFloat = {
+    -NAN, -INFINITY, -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f, -100.f, -4.0f, -3.5f, -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, +NAN, +INFINITY, +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f, +100.f, +4.0f, +3.5f, +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
 };
 
-#if defined (_WIN32)
-#define __attribute__(X)
-#endif
-
-static inline float fclamp( float lo, float v, float hi ) __attribute__ ((always_inline));
-static inline double dclamp( double lo, double v, double hi ) __attribute__ ((always_inline));
-
-static inline float fclamp( float lo, float v, float hi ){ v = v < lo ? lo : v; return v < hi ? v : hi; }
-static inline double dclamp( double lo, double v, double hi ){ v = v < lo ? lo : v; return v < hi ? v : hi; }
-
-// Clamp unsaturated inputs into range so we don't get test errors:
-float clampf_uchar( float f )       { return fclamp( -0.5f, f, 255.5f - 128.0f * FLT_EPSILON ); }
-double clampd_uchar( double f )     { return dclamp( -0.5, f, 255.5 - 128.0 * DBL_EPSILON ); }
-float clampf_uchar_rte( float f )   { return fclamp( -0.5f, f, 255.5f - 128.0f * FLT_EPSILON ); }
-double clampd_uchar_rte( double f ) { return dclamp( -0.5, f, 255.5 - 128.0 * DBL_EPSILON ); }
-float clampf_uchar_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 255.0f ); }
-double clampd_uchar_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 255.0 ); }
-float clampf_uchar_rtn( float f )   { return fclamp( -0.0f, f, 256.0f - 128.0f * FLT_EPSILON); }
-double clampd_uchar_rtn( double f ) { return dclamp( -0.0, f, 256.0 - 128.0 * DBL_EPSILON); }
-float clampf_uchar_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 256.0f - 128.0f * FLT_EPSILON); }
-double clampd_uchar_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 256.0 - 128.0f * DBL_EPSILON); }
-
-float clampf_char( float f )        { return fclamp( -128.5f, f, 127.5f - 64.f * FLT_EPSILON ); }
-double clampd_char( double f )      { return dclamp( -128.5, f, 127.5 - 64. * DBL_EPSILON ); }
-float clampf_char_rte( float f )    { return fclamp( -128.5f, f, 127.5f - 64.f * FLT_EPSILON ); }
-double clampd_char_rte( double f )  { return dclamp( -128.5, f, 127.5 - 64. * DBL_EPSILON ); }
-float clampf_char_rtp( float f )    { return fclamp( -129.0f + 128.f*FLT_EPSILON, f, 127.f ); }
-double clampd_char_rtp( double f )  { return dclamp( -129.0 + 128.*DBL_EPSILON, f, 127. ); }
-float clampf_char_rtn( float f )    { return fclamp( -128.0f, f, 128.f - 64.0f*FLT_EPSILON ); }
-double clampd_char_rtn( double f )  { return dclamp( -128.0, f, 128. - 64.0*DBL_EPSILON ); }
-float clampf_char_rtz( float f )    { return fclamp( -129.0f + 128.f*FLT_EPSILON, f, 128.f - 64.0f*FLT_EPSILON ); }
-double clampd_char_rtz( double f )  { return dclamp( -129.0 + 128.*DBL_EPSILON, f, 128. - 64.0*DBL_EPSILON ); }
-
-float clampf_ushort( float f )       { return fclamp( -0.5f, f, 65535.5f - 32768.0f * FLT_EPSILON ); }
-double clampd_ushort( double f )     { return dclamp( -0.5, f, 65535.5 - 32768.0 * DBL_EPSILON ); }
-float clampf_ushort_rte( float f )   { return fclamp( -0.5f, f, 65535.5f - 32768.0f * FLT_EPSILON ); }
-double clampd_ushort_rte( double f ) { return dclamp( -0.5, f, 65535.5 - 32768.0 * DBL_EPSILON ); }
-float clampf_ushort_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 65535.0f ); }
-double clampd_ushort_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 65535.0 ); }
-float clampf_ushort_rtn( float f )   { return fclamp( -0.0f, f, 65536.0f - 32768.0f * FLT_EPSILON); }
-double clampd_ushort_rtn( double f ) { return dclamp( -0.0, f, 65536.0 - 32768.0 * DBL_EPSILON); }
-float clampf_ushort_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 65536.0f - 32768.0f * FLT_EPSILON); }
-double clampd_ushort_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 65536.0 - 32768.0f * DBL_EPSILON); }
-
-float clampf_short( float f )        { return fclamp( -32768.5f, f, 32767.5f - 16384.f * FLT_EPSILON ); }
-double clampd_short( double f )      { return dclamp( -32768.5, f, 32767.5 - 16384. * DBL_EPSILON ); }
-float clampf_short_rte( float f )    { return fclamp( -32768.5f, f, 32767.5f - 16384.f * FLT_EPSILON ); }
-double clampd_short_rte( double f )  { return dclamp( -32768.5, f, 32767.5 - 16384. * DBL_EPSILON ); }
-float clampf_short_rtp( float f )    { return fclamp( -32769.0f + 32768.f*FLT_EPSILON, f, 32767.f ); }
-double clampd_short_rtp( double f )  { return dclamp( -32769.0 + 32768.*DBL_EPSILON, f, 32767. ); }
-float clampf_short_rtn( float f )    { return fclamp( -32768.0f, f, 32768.f - 16384.0f*FLT_EPSILON ); }
-double clampd_short_rtn( double f )  { return dclamp( -32768.0, f, 32768. - 16384.0*DBL_EPSILON ); }
-float clampf_short_rtz( float f )    { return fclamp( -32769.0f + 32768.f*FLT_EPSILON, f, 32768.f - 16384.0f*FLT_EPSILON ); }
-double clampd_short_rtz( double f )  { return dclamp( -32769.0 + 32768.*DBL_EPSILON, f, 32768. - 16384.0*DBL_EPSILON ); }
-
-float clampf_uint( float f )        { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint( double f )      { return dclamp( -0.5, f, CL_UINT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * DBL_EPSILON ); }
-float clampf_uint_rte( float f )    { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint_rte( double f )  { return dclamp( -0.5, f, CL_UINT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * DBL_EPSILON ); }
-float clampf_uint_rtp( float f )    { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint_rtp( double f )  { return dclamp( -1.0 + DBL_EPSILON/2.0, f, CL_UINT_MAX ); }
-float clampf_uint_rtn( float f )    { return fclamp( -0.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)); }
-double clampd_uint_rtn( double f )  { return dclamp( -0.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21) ); }
-float clampf_uint_rtz( float f )    { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)); }
-double clampd_uint_rtz( double f )  { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21)); }
-
-float clampf_int( float f )         { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int( double f )       { return dclamp( INT_MIN - 0.5, f, CL_INT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rte( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rte( double f )   { return dclamp( INT_MIN - 0.5, f, CL_INT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rtp( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtp( double f )   { return dclamp( INT_MIN - 1.0 + DBL_EPSILON * MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31), f, CL_INT_MAX ); }
-float clampf_int_rtn( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtn( double f )   { return dclamp( INT_MIN, f, CL_INT_MAX + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rtz( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtz( double f )   { return dclamp( INT_MIN - 1.0 + DBL_EPSILON * MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31), f, CL_INT_MAX + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-
-float clampf_float( float f ){ return f; }
-double clampd_float( double f ){ return f; }
-float clampf_float_rte( float f ){ return f; }
-double clampd_float_rte( double f ){ return f; }
-float clampf_float_rtp( float f ){ return f; }
-double clampd_float_rtp( double f ){ return f; }
-float clampf_float_rtn( float f ){ return f; }
-double clampd_float_rtn( double f ){ return f; }
-float clampf_float_rtz( float f ){ return f; }
-double clampd_float_rtz( double f ){ return f; }
-
-float clampf_double( float f ){ return f; }
-double clampd_double( double f ){ return f; }
-float clampf_double_rte( float f ){ return f; }
-double clampd_double_rte( double f ){ return f; }
-float clampf_double_rtp( float f ){ return f; }
-double clampd_double_rtp( double f ){ return f; }
-float clampf_double_rtn( float f ){ return f; }
-double clampd_double_rtn( double f ){ return f; }
-float clampf_double_rtz( float f ){ return f; }
-double clampd_double_rtz( double f ){ return f; }
-
-float clampf_ulong( float f )       { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong( double f )     { return dclamp( -0.5, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rte( float f )   { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rte( double f ) { return dclamp( -0.5, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtn( float f )   { return fclamp( -0.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtn( double f ) { return dclamp( -0.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-
-float clampf_long( float f )        { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long( double f )      { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rte( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rte( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtp( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtp( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtn( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtn( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtz( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtz( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-
-#pragma mark -
-
-int alwaysPass( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int alwaysFail( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_uchar( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_char( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_ushort( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_short( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_uint( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_int( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_ulong( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_long( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_float( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_double( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-
-void init_uchar( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_char( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_ushort( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_short( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_uint( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_int( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_float( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_double( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_ulong( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_long( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-
-InitDataFunc gInitFunctions[ kTypeCount ] = {
-                                                init_uchar, init_char,
-                                                init_ushort, init_short,
-                                                init_uint, init_int,
-                                                init_float, init_double,
-                                                init_ulong, init_long
-                                            };
-
-
-CheckResults gCheckResults[ kTypeCount ] = {
-                                                check_uchar, check_char, check_ushort, check_short, check_uint,
-                                                check_int, check_float, check_double, check_ulong, check_long
-                                            };
-#if !defined (__APPLE__)
-#define UNUSED
-#else
-#define UNUSED  __attribute__((unused))
-#endif
-
-int alwaysPass( void UNUSED *out1, void UNUSED *out2, void UNUSED *allowZ, uint32_t UNUSED count, int UNUSED vectorSize){ return 0; }
-int alwaysFail( void UNUSED *out1, void UNUSED *out2, void UNUSED *allowZ, uint32_t UNUSED count, int UNUSED vectorSize ){ return -1; }
-
-int check_uchar( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_uchar *t = (const cl_uchar*)test;
-    const cl_uchar *c = (const cl_uchar*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_uchar)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%2.2x vs 0x%2.2x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_char( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_char *t = (const cl_char*)test;
-    const cl_char *c = (const cl_char*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_char)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%2.2x vs 0x%2.2x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_ushort( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ushort *t = (const cl_ushort*)test;
-    const cl_ushort *c = (const cl_ushort*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_ushort)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%4.4x vs 0x%4.4x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_short( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_short *t = (const cl_short*)test;
-    const cl_short *c = (const cl_short*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_short)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%4.4x vs 0x%4.4x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_uint( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_uint *t = (const cl_uint*)test;
-    const cl_uint *c = (const cl_uint*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_uint)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%8.8x vs 0x%8.8x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_int( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_int *t = (const cl_int*)test;
-    const cl_int *c = (const cl_int*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_int)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%8.8x vs 0x%8.8x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_ulong( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ulong *t = (const cl_ulong*)test;
-    const cl_ulong *c = (const cl_ulong*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_ulong)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%16.16llx vs 0x%16.16llx\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_long( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_long *t = (const cl_long*)test;
-    const cl_long *c = (const cl_long*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_long)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%16.16llx vs 0x%16.16llx\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_float( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_uint *t = (const cl_uint*)test;
-    const cl_uint *c = (const cl_uint*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if (t[i] != c[i] &&
-            // Allow nan's to be binary different
-            !((t[i] & 0x7fffffffU) > 0x7f800000U &&
-              (c[i] & 0x7fffffffU) > 0x7f800000U) &&
-            !(a[i] != (cl_uchar)0 &&
-              t[i] == (c[i] & 0x80000000U))) {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
-                    vectorSize, i, ((float*)correct)[i], ((float*)test)[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-int check_double( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ulong *t = (const cl_ulong*)test;
-    const cl_ulong *c = (const cl_ulong*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if (t[i] != c[i] &&
-            // Allow nan's to be binary different
-            !((t[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL &&
-              (c[i] & 0x7fffffffffffffffULL) > 0x7f80000000000000ULL) &&
-            !(a[i] != (cl_uchar)0 &&
-              t[i] == (c[i] & 0x8000000000000000ULL))) {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
-                  vectorSize, i, ((double*)correct)[i], ((double*)test)[i] );
-            return i + 1;
-        }
-
-    return 0;
-}
-
-
-void init_uchar( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    cl_uchar *o = (cl_uchar *)out;
-    int i;
-
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
-
-void init_char( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    char *o = (char *)out;
-    int i;
-
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
-
-void init_ushort( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    cl_ushort *o = (cl_ushort *)out;
-    int i;
-
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
-
-void init_short( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, UNUSED Type destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    short *o = (short *)out;
-    int i;
-
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
-
-void init_uint( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
-{
-    static const unsigned int specialValuesUInt[] = {
-    INT_MIN, INT_MIN + 1, INT_MIN + 2,
-    -(1<<30)-3,-(1<<30)-2,-(1<<30)-1, -(1<<30), -(1<<30)+1, -(1<<30)+2, -(1<<30)+3,
-    -(1<<24)-3,-(1<<24)-2,-(1<<24)-1, -(1<<24), -(1<<24)+1, -(1<<24)+2, -(1<<24)+3,
-    -(1<<23)-3,-(1<<23)-2,-(1<<23)-1, -(1<<23), -(1<<23)+1, -(1<<23)+2, -(1<<23)+3,
-    -(1<<22)-3,-(1<<22)-2,-(1<<22)-1, -(1<<22), -(1<<22)+1, -(1<<22)+2, -(1<<22)+3,
-    -(1<<21)-3,-(1<<21)-2,-(1<<21)-1, -(1<<21), -(1<<21)+1, -(1<<21)+2, -(1<<21)+3,
-    -(1<<16)-3,-(1<<16)-2,-(1<<16)-1, -(1<<16), -(1<<16)+1, -(1<<16)+2, -(1<<16)+3,
-    -(1<<15)-3,-(1<<15)-2,-(1<<15)-1, -(1<<15), -(1<<15)+1, -(1<<15)+2, -(1<<15)+3,
-    -(1<<8)-3,-(1<<8)-2,-(1<<8)-1, -(1<<8), -(1<<8)+1, -(1<<8)+2, -(1<<8)+3,
-    -(1<<7)-3,-(1<<7)-2,-(1<<7)-1, -(1<<7), -(1<<7)+1, -(1<<7)+2, -(1<<7)+3,
-    -4, -3, -2, -1, 0, 1, 2, 3, 4,
-    (1<<7)-3,(1<<7)-2,(1<<7)-1, (1<<7), (1<<7)+1, (1<<7)+2, (1<<7)+3,
-    (1<<8)-3,(1<<8)-2,(1<<8)-1, (1<<8), (1<<8)+1, (1<<8)+2, (1<<8)+3,
-    (1<<15)-3,(1<<15)-2,(1<<15)-1, (1<<15), (1<<15)+1, (1<<15)+2, (1<<15)+3,
-    (1<<16)-3,(1<<16)-2,(1<<16)-1, (1<<16), (1<<16)+1, (1<<16)+2, (1<<16)+3,
-    (1<<21)-3,(1<<21)-2,(1<<21)-1, (1<<21), (1<<21)+1, (1<<21)+2, (1<<21)+3,
-    (1<<22)-3,(1<<22)-2,(1<<22)-1, (1<<22), (1<<22)+1, (1<<22)+2, (1<<22)+3,
-    (1<<23)-3,(1<<23)-2,(1<<23)-1, (1<<23), (1<<23)+1, (1<<23)+2, (1<<23)+3,
-    (1<<24)-3,(1<<24)-2,(1<<24)-1, (1<<24), (1<<24)+1, (1<<24)+2, (1<<24)+3,
-    (1<<30)-3,(1<<30)-2,(1<<30)-1, (1<<30), (1<<30)+1, (1<<30)+2, (1<<30)+3,
-    INT_MAX-3, INT_MAX-2, INT_MAX-1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
-    UINT_MAX-3, UINT_MAX-2, UINT_MAX-1, UINT_MAX
-    };
-
-    cl_uint *o = (cl_uint *)out;
-    int i;
-
-    for( i = 0; i < count; i++) {
-    if( gIsEmbedded )
-        o[i] = (cl_uint) genrand_int32(d);
-    else
-        o[i] = (cl_uint)i + start;
-    }
-
-    if( 0 == start )
-    {
-    size_t tableSize = sizeof( specialValuesUInt );
-    if( sizeof( cl_uint) * count < tableSize )
-        tableSize = sizeof( cl_uint) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesUInt, tableSize );
-    }
-}
-
-void init_int( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
-{
-    static const unsigned int specialValuesInt[] = {
-    INT_MIN, INT_MIN + 1, INT_MIN + 2,
-    -(1<<30)-3,-(1<<30)-2,-(1<<30)-1, -(1<<30), -(1<<30)+1, -(1<<30)+2, -(1<<30)+3,
-    -(1<<24)-3,-(1<<24)-2,-(1<<24)-1, -(1<<24), -(1<<24)+1, -(1<<24)+2, -(1<<24)+3,
-    -(1<<23)-3,-(1<<23)-2,-(1<<23)-1, -(1<<23), -(1<<23)+1, -(1<<23)+2, -(1<<23)+3,
-    -(1<<22)-3,-(1<<22)-2,-(1<<22)-1, -(1<<22), -(1<<22)+1, -(1<<22)+2, -(1<<22)+3,
-    -(1<<21)-3,-(1<<21)-2,-(1<<21)-1, -(1<<21), -(1<<21)+1, -(1<<21)+2, -(1<<21)+3,
-    -(1<<16)-3,-(1<<16)-2,-(1<<16)-1, -(1<<16), -(1<<16)+1, -(1<<16)+2, -(1<<16)+3,
-    -(1<<15)-3,-(1<<15)-2,-(1<<15)-1, -(1<<15), -(1<<15)+1, -(1<<15)+2, -(1<<15)+3,
-    -(1<<8)-3,-(1<<8)-2,-(1<<8)-1, -(1<<8), -(1<<8)+1, -(1<<8)+2, -(1<<8)+3,
-    -(1<<7)-3,-(1<<7)-2,-(1<<7)-1, -(1<<7), -(1<<7)+1, -(1<<7)+2, -(1<<7)+3,
-    -4, -3, -2, -1, 0, 1, 2, 3, 4,
-    (1<<7)-3,(1<<7)-2,(1<<7)-1, (1<<7), (1<<7)+1, (1<<7)+2, (1<<7)+3,
-    (1<<8)-3,(1<<8)-2,(1<<8)-1, (1<<8), (1<<8)+1, (1<<8)+2, (1<<8)+3,
-    (1<<15)-3,(1<<15)-2,(1<<15)-1, (1<<15), (1<<15)+1, (1<<15)+2, (1<<15)+3,
-    (1<<16)-3,(1<<16)-2,(1<<16)-1, (1<<16), (1<<16)+1, (1<<16)+2, (1<<16)+3,
-    (1<<21)-3,(1<<21)-2,(1<<21)-1, (1<<21), (1<<21)+1, (1<<21)+2, (1<<21)+3,
-    (1<<22)-3,(1<<22)-2,(1<<22)-1, (1<<22), (1<<22)+1, (1<<22)+2, (1<<22)+3,
-    (1<<23)-3,(1<<23)-2,(1<<23)-1, (1<<23), (1<<23)+1, (1<<23)+2, (1<<23)+3,
-    (1<<24)-3,(1<<24)-2,(1<<24)-1, (1<<24), (1<<24)+1, (1<<24)+2, (1<<24)+3,
-    (1<<30)-3,(1<<30)-2,(1<<30)-1, (1<<30), (1<<30)+1, (1<<30)+2, (1<<30)+3,
-    INT_MAX-3, INT_MAX-2, INT_MAX-1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
-    UINT_MAX-3, UINT_MAX-2, UINT_MAX-1, UINT_MAX
-    };
-
-    int *o = (int *)out;
-    int i;
-
-    for( i = 0; i < count; i++ ) {
-    if( gIsEmbedded ) {
-        o[i] = (int) genrand_int32(d);
-    }
-    else {
-        o[i] = (int) i + start;
-    }
-    }
-
-    if( 0 == start )
-    {
-    size_t tableSize = sizeof( specialValuesInt );
-    if( sizeof( int) * count < tableSize )
-        tableSize = sizeof( int) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesInt, tableSize );
-    }
-}
-
-void init_float( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata d )
-{
-    static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
-    };
-
-    cl_uint *o = (cl_uint *)out;
-    int i;
-
-    for( i = 0; i < count; i++ ) {
-    if( gIsEmbedded )
-        o[i] = (cl_uint) genrand_int32(d);
-    else
-        o[i] = (cl_uint) i + start;
-    }
-
-    if( 0 == start )
-    {
-    size_t tableSize = sizeof( specialValuesFloat );
-    if( sizeof( float) * count < tableSize )
-        tableSize = sizeof( float) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesFloat, tableSize );
-    }
-
-    if( kUnsaturated == sat )
-    {
-        clampf func = gClampFloat[ destType ][round];
-        float *f = (float *)out;
-
-        for( i = 0; i < count; i++ )
-            f[i] = func( f[i] );
-    }
-}
-
-// used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32( uint32_t bits );
-static inline double DoubleFromUInt32( uint32_t bits )
-{
-    union{ uint64_t u; double d;} u;
-
-    // split 0x89abcdef to 0x89abc00000000def
-    u.u = bits & 0xfffU;
-    u.u |= (uint64_t) (bits & ~0xfffU) << 32;
-
-    // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s
-    u.u -= (bits & 0x800U) << 1;
-
-    // return result
-    return u.d;
-}
-
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.80000000000001p64, -0x180000000000001LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.8p64, -0x18LL, 60), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp64, -0x17ffffffffffffLL, 12),     MAKE_HEX_DOUBLE(-0x1.80000000000001p63, -0x180000000000001LL, 7), MAKE_HEX_DOUBLE(-0x1.8p63, -0x18LL, 59), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp63, -0x17ffffffffffffLL, 11),
-     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(-0x1.80000000000001p32, -0x180000000000001LL, -24), MAKE_HEX_DOUBLE(-0x1.8p32, -0x18LL, 28), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp32, -0x17ffffffffffffLL, -20),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.80000000000001p31, -0x180000000000001LL, -25), MAKE_HEX_DOUBLE(-0x1.8p31, -0x18LL, 27), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp31, -0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(0x1.80000000000001p63, 0x180000000000001LL, 7), MAKE_HEX_DOUBLE(0x1.8p63, 0x18LL, 59), MAKE_HEX_DOUBLE(0x1.7ffffffffffffp63, 0x17ffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-     MAKE_HEX_DOUBLE(+0x1.80000000000001p32, +0x180000000000001LL, -24), MAKE_HEX_DOUBLE(+0x1.8p32, +0x18LL, 28), MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp32, +0x17ffffffffffffLL, -20),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.80000000000001p31, +0x180000000000001LL, -25), MAKE_HEX_DOUBLE(+0x1.8p31, +0x18LL, 27), MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp31, +0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
-
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep62, -0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp62, -0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep62, +0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp62, +0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep51, -0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp51, -0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp51, -0x1fffffffffffffLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep51, +0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp51, +0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp51, +0x1fffffffffffffLL, -1),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep52, -0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp52, -0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp52, -0x1fffffffffffffLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep52, +0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp52, +0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp52, +0x1fffffffffffffLL, 0),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep53, -0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp53, -0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp53, -0x1fffffffffffffLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep53, +0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp53, +0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp53, +0x1fffffffffffffLL, 1),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p52, -0x10000000000002LL, 0), MAKE_HEX_DOUBLE(-0x1.0000000000001p52, -0x10000000000001LL, 0), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52), MAKE_HEX_DOUBLE(+0x1.0000000000002p52, +0x10000000000002LL, 0), MAKE_HEX_DOUBLE(+0x1.0000000000001p52, +0x10000000000001LL, 0), MAKE_HEX_DOUBLE(+0x1.0p52, +0x1LL, 52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p53, -0x10000000000002LL, 1), MAKE_HEX_DOUBLE(-0x1.0000000000001p53, -0x10000000000001LL, 1), MAKE_HEX_DOUBLE(-0x1.0p53, -0x1LL, 53), MAKE_HEX_DOUBLE(+0x1.0000000000002p53, +0x10000000000002LL, 1), MAKE_HEX_DOUBLE(+0x1.0000000000001p53, +0x10000000000001LL, 1), MAKE_HEX_DOUBLE(+0x1.0p53, +0x1LL, 53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p54, -0x10000000000002LL, 2), MAKE_HEX_DOUBLE(-0x1.0000000000001p54, -0x10000000000001LL, 2), MAKE_HEX_DOUBLE(-0x1.0p54, -0x1LL, 54), MAKE_HEX_DOUBLE(+0x1.0000000000002p54, +0x10000000000002LL, 2), MAKE_HEX_DOUBLE(+0x1.0000000000001p54, +0x10000000000001LL, 2), MAKE_HEX_DOUBLE(+0x1.0p54, +0x1LL, 54),
-    MAKE_HEX_DOUBLE(-0x1.fffffffefffffp62, -0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffp62, -0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(-0x1.ffffffff00001p62, -0x1ffffffff00001LL, 10), MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
+std::vector<double> DataInitInfo::specialValuesDouble = { // hard inputs: NaN, infinities, values around 2^30..2^64 and 2^51..2^54, x.5 ties and neighbours, denormals, zeros
+    -NAN, -INFINITY, -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.80000000000001p64, -0x180000000000001LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.8p64, -0x18LL, 60), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp64, -0x17ffffffffffffLL, 12),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p63, -0x180000000000001LL, 7), MAKE_HEX_DOUBLE(-0x1.8p63, -0x18LL, 59),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp63, -0x17ffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p32, -0x180000000000001LL, -24), MAKE_HEX_DOUBLE(-0x1.8p32, -0x18LL, 28),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp32, -0x17ffffffffffffLL, -20), MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p31, -0x180000000000001LL, -25), MAKE_HEX_DOUBLE(-0x1.8p31, -0x18LL, 27),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp31, -0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000., -100., -4.0, -3.5, -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(0x1.80000000000001p63, 0x180000000000001LL, 7), MAKE_HEX_DOUBLE(0x1.8p63, 0x18LL, 59),
+    MAKE_HEX_DOUBLE(0x1.7ffffffffffffp63, 0x17ffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.80000000000001p32, +0x180000000000001LL, -24), MAKE_HEX_DOUBLE(+0x1.8p32, +0x18LL, 28),
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp32, +0x17ffffffffffffLL, -20), MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.80000000000001p31, +0x180000000000001LL, -25), MAKE_HEX_DOUBLE(+0x1.8p31, +0x18LL, 27),
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp31, +0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000., +100., +4.0, +3.5, +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52), // positive mirror of the -0x1.0000000000001p0 entry above
+    +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, MAKE_HEX_DOUBLE(-0x1.ffffffffffffep62, -0x1ffffffffffffeLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp62, -0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep62, +0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp62, +0x1ffffffffffffcLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep51, -0x1ffffffffffffeLL, -1),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp51, -0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp51, -0x1fffffffffffffLL, -1),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep51, +0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp51, +0x1ffffffffffffcLL, -1),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp51, +0x1fffffffffffffLL, -1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep52, -0x1ffffffffffffeLL, 0),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp52, -0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp52, -0x1fffffffffffffLL, 0),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep52, +0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp52, +0x1ffffffffffffcLL, 0),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp52, +0x1fffffffffffffLL, 0), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep53, -0x1ffffffffffffeLL, 1),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp53, -0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp53, -0x1fffffffffffffLL, 1),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep53, +0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp53, +0x1ffffffffffffcLL, 1),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp53, +0x1fffffffffffffLL, 1), MAKE_HEX_DOUBLE(-0x1.0000000000002p52, -0x10000000000002LL, 0),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p52, -0x10000000000001LL, 0), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p52, +0x10000000000002LL, 0), MAKE_HEX_DOUBLE(+0x1.0000000000001p52, +0x10000000000001LL, 0),
+    MAKE_HEX_DOUBLE(+0x1.0p52, +0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0000000000002p53, -0x10000000000002LL, 1),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p53, -0x10000000000001LL, 1), MAKE_HEX_DOUBLE(-0x1.0p53, -0x1LL, 53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p53, +0x10000000000002LL, 1), MAKE_HEX_DOUBLE(+0x1.0000000000001p53, +0x10000000000001LL, 1),
+    MAKE_HEX_DOUBLE(+0x1.0p53, +0x1LL, 53), MAKE_HEX_DOUBLE(-0x1.0000000000002p54, -0x10000000000002LL, 2),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p54, -0x10000000000001LL, 2), MAKE_HEX_DOUBLE(-0x1.0p54, -0x1LL, 54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p54, +0x10000000000002LL, 2), MAKE_HEX_DOUBLE(+0x1.0000000000001p54, +0x10000000000001LL, 2),
+    MAKE_HEX_DOUBLE(+0x1.0p54, +0x1LL, 54), MAKE_HEX_DOUBLE(-0x1.fffffffefffffp62, -0x1fffffffefffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffp62, -0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(-0x1.ffffffff00001p62, -0x1ffffffff00001LL, 10),
+    MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30),
+    MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
 };
+// clang-format on
 
 
-void init_double( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata UNUSED d )
+// Windows (since long double got deprecated) sets the x87 to 53-bit precision
+// (that's x87 default state).  This causes problems with the tests that
+// convert long and ulong to float and double or otherwise deal with values
+// that need more than 53 bits. So set the x87 to 64-bit precision (MinGW builds only; a no-op elsewhere).
+static inline void Force64BitFPUPrecision(void)
 {
-    double *o = (double*)out;
-    int i;
-
-    for( i = 0; i < count; i++ )
-    {
-        uint64_t z = i + start;
-        o[i] = DoubleFromUInt32( (uint32_t) z ^ (uint32_t) (z >> 32));
-    }
-
-    if( 0 == start )
-    {
-        size_t tableSize = sizeof( specialValuesDouble );
-        if( sizeof( cl_double) * count < tableSize )
-            tableSize = sizeof( cl_double) * count;
-        memcpy( (char*)(o + i) - tableSize, specialValuesDouble, tableSize );
-    }
-
-    if( 0 == sat )
-    {
-        clampd func = gClampDouble[ destType ][round];
-
-        for( i = 0; i < count; i++ )
-            o[i] = func( o[i] );
-    }
+#if __MINGW32__
+    // The usual method is to use _controlfp as follows:
+    //     #include <float.h>
+    //     _controlfp(_PC_64, _MCW_PC);
+    //
+    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
+    // divergent code just use inline assembly which works for both.
+    unsigned short int orig_cw = 0;
+    unsigned short int new_cw = 0;
+    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw)); // store the current x87 control word
+    new_cw = orig_cw | 0x0300; // precision-control field (bits 8-9) = 11b: 64-bit precision
+    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw)); // load the updated control word
+#else
+    /* Nothing is changed here; implement for other platforms if needed */
+#endif
 }
 
-cl_ulong random64( MTdata d )
+// Compare `count` results in `test` against gRef: returns 0 when all are accepted, else the 1-based index of the first mismatch (also logged).
+template <typename InType, typename OutType>
+int CalcRefValsPat<InType, OutType>::check_result(void *test, uint32_t count,
+                                                  int vectorSize)
 {
-    return (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
-}
+    const cl_uchar *a = (const cl_uchar *)gAllowZ; // nonzero entry: a zero result (float/double: zero with the reference's sign bit) is also accepted
 
-void init_ulong( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
-{
-    cl_ulong *o = (cl_ulong *)out;
-    cl_ulong i, j, k;
-
-    i = 0;
-    if( start == 0 )
-    {
-        //Try various powers of two
-        for( j = 0; j < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[j] = (cl_ulong) 1 << j;
-        i = j;
-
-        // try the complement of those
-        for( j = 0; i < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[i++] = ~((cl_ulong) 1 << j);
-
-        //Try various negative powers of two
-        for( j = 0; i < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[i++] = (cl_ulong) 0xFFFFFFFFFFFFFFFEULL << j;
-
-        //try various powers of two plus 1, shifted by various amounts
-        for( j = 0; i < (cl_ulong)count && j < 8 * sizeof(cl_ulong); j++ )
-            for( k = 0; i < (cl_ulong)count && k < 8 * sizeof(cl_ulong) - j; k++ )
-                o[i++] = (((cl_ulong) 1 << j) + 1) << k;
-
-        //try various powers of two minus 1
-        for( j = 0; i < (cl_ulong)count && j < 8 * sizeof(cl_ulong); j++ )
-            for( k = 0; i < (cl_ulong)count && k < 8 * sizeof(cl_ulong) - j; k++ )
-                o[i++] = (((cl_ulong) 1 << j) - 1) << k;
-
-        // Other patterns
-        cl_ulong pattern[] = { 0x3333333333333333ULL, 0x5555555555555555ULL, 0x9999999999999999ULL, 0x6666666666666666ULL, 0xccccccccccccccccULL, 0xaaaaaaaaaaaaaaaaULL };
-        cl_ulong mask[] = { 0xffffffffffffffffULL, 0xff00ff00ff00ff00ULL, 0xffff0000ffff0000ULL, 0xffffffff00000000ULL };
-        for( j = 0; i < (cl_ulong) count && j < sizeof(pattern) / sizeof( pattern[0]); j++ )
-            for( k = 0; i + 2 <= (cl_ulong) count && k < sizeof(mask) / sizeof( mask[0]); k++ )
+    if (std::is_integral<OutType>::value)
+    { // integer outputs (char/uchar/short/ushort/int/uint/long/ulong): exact compare
+        const OutType *t = (const OutType *)test;
+        const OutType *c = (const OutType *)gRef;
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (OutType)0))
             {
-                o[i++] = pattern[j] & mask[k];
-                o[i++] = pattern[j] & ~mask[k];
+                size_t s = sizeof(OutType) * 2; // hex digits needed to print one OutType
+                std::stringstream sstr;
+                sstr << "\nError for vector size %d found at 0x%8.8x:  *0x%"
+                     << s << "." << s << "x vs 0x%" << s << "." << s << "x\n";
+                vlog(sstr.str().c_str(), vectorSize, i, c[i], t[i]); // NOTE(review): %x with a 64-bit OutType argument looks mismatched - confirm
+                return i + 1;
+            }
+    }
+    else if (std::is_same<OutType, cl_float>::value)
+    {
+        // compare raw bit patterns as cl_uint (kept from the original test)
+        const cl_uint *t = (const cl_uint *)test;
+        const cl_uint *c = (const cl_uint *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Two NaNs are accepted even when their bit patterns differ
+                !((t[i] & 0x7fffffffU) > 0x7f800000U
+                  && (c[i] & 0x7fffffffU) > 0x7f800000U)
+                && !(a[i] != (cl_uchar)0 && t[i] == (c[i] & 0x80000000U)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, ((OutType *)gRef)[i], ((OutType *)test)[i]);
+                return i + 1;
+            }
+    }
+    else
+    {
+        const cl_ulong *t = (const cl_ulong *)test;
+        const cl_ulong *c = (const cl_ulong *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Two NaNs are accepted even when their bit patterns differ (double NaN: magnitude bits > 0x7ff0...0)
+                !((t[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL
+                  && (c[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL)
+                && !(a[i] != (cl_uchar)0
+                     && t[i] == (c[i] & 0x8000000000000000ULL)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, ((OutType *)gRef)[i], ((OutType *)test)[i]);
+                return i + 1;
             }
     }
 
-    for( ; i < (cl_ulong) count; i++ )
-        o[i] = random64(d);
+    return 0;
 }
 
-void init_long( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata d )
+
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
-    init_ulong( out, sat, round, destType, start, count, d );
+    // Strip the lowest set bit repeatedly until at most one bit is left.
+    cl_uint top = x;
+    while (top & (top - 1)) top &= top - 1;
+    // Nothing stripped: x had at most one bit set and is returned unchanged.
+    return (top == x) ? x : top + top;
 }
 
-// ======
 
-void uchar2uchar_many( void *out, void *in, size_t n);
-void uchar2uchar_sat_many( void *out, void *in, size_t n);
-void char2uchar_many( void *out, void *in, size_t n);
-void char2uchar_sat_many( void *out, void *in, size_t n);
-void ushort2uchar_many( void *out, void *in, size_t n);
-void ushort2uchar_sat_many( void *out, void *in, size_t n);
-void short2uchar_many( void *out, void *in, size_t n);
-void short2uchar_sat_many( void *out, void *in, size_t n);
-void uint2uchar_many( void *out, void *in, size_t n);
-void uint2uchar_sat_many( void *out, void *in, size_t n);
-void int2uchar_many( void *out, void *in, size_t n);
-void int2uchar_sat_many( void *out, void *in, size_t n);
-void float2uchar_many( void *out, void *in, size_t n);
-void float2uchar_sat_many( void *out, void *in, size_t n);
-void double2uchar_many( void *out, void *in, size_t n);
-void double2uchar_sat_many( void *out, void *in, size_t n);
-void ulong2uchar_many( void *out, void *in, size_t n);
-void ulong2uchar_sat_many( void *out, void *in, size_t n);
-void long2uchar_many( void *out, void *in, size_t n);
-void long2uchar_sat_many( void *out, void *in, size_t n);
-void uchar2char_many( void *out, void *in, size_t n);
-void uchar2char_sat_many( void *out, void *in, size_t n);
-void char2char_many( void *out, void *in, size_t n);
-void char2char_sat_many( void *out, void *in, size_t n);
-void ushort2char_many( void *out, void *in, size_t n);
-void ushort2char_sat_many( void *out, void *in, size_t n);
-void short2char_many( void *out, void *in, size_t n);
-void short2char_sat_many( void *out, void *in, size_t n);
-void uint2char_many( void *out, void *in, size_t n);
-void uint2char_sat_many( void *out, void *in, size_t n);
-void int2char_many( void *out, void *in, size_t n);
-void int2char_sat_many( void *out, void *in, size_t n);
-void float2char_many( void *out, void *in, size_t n);
-void float2char_sat_many( void *out, void *in, size_t n);
-void double2char_many( void *out, void *in, size_t n);
-void double2char_sat_many( void *out, void *in, size_t n);
-void ulong2char_many( void *out, void *in, size_t n);
-void ulong2char_sat_many( void *out, void *in, size_t n);
-void long2char_many( void *out, void *in, size_t n);
-void long2char_sat_many( void *out, void *in, size_t n);
-void uchar2ushort_many( void *out, void *in, size_t n);
-void uchar2ushort_sat_many( void *out, void *in, size_t n);
-void char2ushort_many( void *out, void *in, size_t n);
-void char2ushort_sat_many( void *out, void *in, size_t n);
-void ushort2ushort_many( void *out, void *in, size_t n);
-void ushort2ushort_sat_many( void *out, void *in, size_t n);
-void short2ushort_many( void *out, void *in, size_t n);
-void short2ushort_sat_many( void *out, void *in, size_t n);
-void uint2ushort_many( void *out, void *in, size_t n);
-void uint2ushort_sat_many( void *out, void *in, size_t n);
-void int2ushort_many( void *out, void *in, size_t n);
-void int2ushort_sat_many( void *out, void *in, size_t n);
-void float2ushort_many( void *out, void *in, size_t n);
-void float2ushort_sat_many( void *out, void *in, size_t n);
-void double2ushort_many( void *out, void *in, size_t n);
-void double2ushort_sat_many( void *out, void *in, size_t n);
-void ulong2ushort_many( void *out, void *in, size_t n);
-void ulong2ushort_sat_many( void *out, void *in, size_t n);
-void long2ushort_many( void *out, void *in, size_t n);
-void long2ushort_sat_many( void *out, void *in, size_t n);
-void uchar2short_many( void *out, void *in, size_t n);
-void uchar2short_sat_many( void *out, void *in, size_t n);
-void char2short_many( void *out, void *in, size_t n);
-void char2short_sat_many( void *out, void *in, size_t n);
-void ushort2short_many( void *out, void *in, size_t n);
-void ushort2short_sat_many( void *out, void *in, size_t n);
-void short2short_many( void *out, void *in, size_t n);
-void short2short_sat_many( void *out, void *in, size_t n);
-void uint2short_many( void *out, void *in, size_t n);
-void uint2short_sat_many( void *out, void *in, size_t n);
-void int2short_many( void *out, void *in, size_t n);
-void int2short_sat_many( void *out, void *in, size_t n);
-void float2short_many( void *out, void *in, size_t n);
-void float2short_sat_many( void *out, void *in, size_t n);
-void double2short_many( void *out, void *in, size_t n);
-void double2short_sat_many( void *out, void *in, size_t n);
-void ulong2short_many( void *out, void *in, size_t n);
-void ulong2short_sat_many( void *out, void *in, size_t n);
-void long2short_many( void *out, void *in, size_t n);
-void long2short_sat_many( void *out, void *in, size_t n);
-void uchar2uint_many( void *out, void *in, size_t n);
-void uchar2uint_sat_many( void *out, void *in, size_t n);
-void char2uint_many( void *out, void *in, size_t n);
-void char2uint_sat_many( void *out, void *in, size_t n);
-void ushort2uint_many( void *out, void *in, size_t n);
-void ushort2uint_sat_many( void *out, void *in, size_t n);
-void short2uint_many( void *out, void *in, size_t n);
-void short2uint_sat_many( void *out, void *in, size_t n);
-void uint2uint_many( void *out, void *in, size_t n);
-void uint2uint_sat_many( void *out, void *in, size_t n);
-void int2uint_many( void *out, void *in, size_t n);
-void int2uint_sat_many( void *out, void *in, size_t n);
-void float2uint_many( void *out, void *in, size_t n);
-void float2uint_sat_many( void *out, void *in, size_t n);
-void double2uint_many( void *out, void *in, size_t n);
-void double2uint_sat_many( void *out, void *in, size_t n);
-void ulong2uint_many( void *out, void *in, size_t n);
-void ulong2uint_sat_many( void *out, void *in, size_t n);
-void long2uint_many( void *out, void *in, size_t n);
-void long2uint_sat_many( void *out, void *in, size_t n);
-void uchar2int_many( void *out, void *in, size_t n);
-void uchar2int_sat_many( void *out, void *in, size_t n);
-void char2int_many( void *out, void *in, size_t n);
-void char2int_sat_many( void *out, void *in, size_t n);
-void ushort2int_many( void *out, void *in, size_t n);
-void ushort2int_sat_many( void *out, void *in, size_t n);
-void short2int_many( void *out, void *in, size_t n);
-void short2int_sat_many( void *out, void *in, size_t n);
-void uint2int_many( void *out, void *in, size_t n);
-void uint2int_sat_many( void *out, void *in, size_t n);
-void int2int_many( void *out, void *in, size_t n);
-void int2int_sat_many( void *out, void *in, size_t n);
-void float2int_many( void *out, void *in, size_t n);
-void float2int_sat_many( void *out, void *in, size_t n);
-void double2int_many( void *out, void *in, size_t n);
-void double2int_sat_many( void *out, void *in, size_t n);
-void ulong2int_many( void *out, void *in, size_t n);
-void ulong2int_sat_many( void *out, void *in, size_t n);
-void long2int_many( void *out, void *in, size_t n);
-void long2int_sat_many( void *out, void *in, size_t n);
-void uchar2float_many( void *out, void *in, size_t n);
-void uchar2float_sat_many( void *out, void *in, size_t n);
-void char2float_many( void *out, void *in, size_t n);
-void char2float_sat_many( void *out, void *in, size_t n);
-void ushort2float_many( void *out, void *in, size_t n);
-void ushort2float_sat_many( void *out, void *in, size_t n);
-void short2float_many( void *out, void *in, size_t n);
-void short2float_sat_many( void *out, void *in, size_t n);
-void uint2float_many( void *out, void *in, size_t n);
-void uint2float_sat_many( void *out, void *in, size_t n);
-void int2float_many( void *out, void *in, size_t n);
-void int2float_sat_many( void *out, void *in, size_t n);
-void float2float_many( void *out, void *in, size_t n);
-void float2float_sat_many( void *out, void *in, size_t n);
-void double2float_many( void *out, void *in, size_t n);
-void double2float_sat_many( void *out, void *in, size_t n);
-void ulong2float_many( void *out, void *in, size_t n);
-void ulong2float_sat_many( void *out, void *in, size_t n);
-void long2float_many( void *out, void *in, size_t n);
-void long2float_sat_many( void *out, void *in, size_t n);
-void uchar2double_many( void *out, void *in, size_t n);
-void uchar2double_sat_many( void *out, void *in, size_t n);
-void char2double_many( void *out, void *in, size_t n);
-void char2double_sat_many( void *out, void *in, size_t n);
-void ushort2double_many( void *out, void *in, size_t n);
-void ushort2double_sat_many( void *out, void *in, size_t n);
-void short2double_many( void *out, void *in, size_t n);
-void short2double_sat_many( void *out, void *in, size_t n);
-void uint2double_many( void *out, void *in, size_t n);
-void uint2double_sat_many( void *out, void *in, size_t n);
-void int2double_many( void *out, void *in, size_t n);
-void int2double_sat_many( void *out, void *in, size_t n);
-void float2double_many( void *out, void *in, size_t n);
-void float2double_sat_many( void *out, void *in, size_t n);
-void double2double_many( void *out, void *in, size_t n);
-void double2double_sat_many( void *out, void *in, size_t n);
-void ulong2double_many( void *out, void *in, size_t n);
-void ulong2double_sat_many( void *out, void *in, size_t n);
-void long2double_many( void *out, void *in, size_t n);
-void long2double_sat_many( void *out, void *in, size_t n);
-void uchar2ulong_many( void *out, void *in, size_t n);
-void uchar2ulong_sat_many( void *out, void *in, size_t n);
-void char2ulong_many( void *out, void *in, size_t n);
-void char2ulong_sat_many( void *out, void *in, size_t n);
-void ushort2ulong_many( void *out, void *in, size_t n);
-void ushort2ulong_sat_many( void *out, void *in, size_t n);
-void short2ulong_many( void *out, void *in, size_t n);
-void short2ulong_sat_many( void *out, void *in, size_t n);
-void uint2ulong_many( void *out, void *in, size_t n);
-void uint2ulong_sat_many( void *out, void *in, size_t n);
-void int2ulong_many( void *out, void *in, size_t n);
-void int2ulong_sat_many( void *out, void *in, size_t n);
-void float2ulong_many( void *out, void *in, size_t n);
-void float2ulong_sat_many( void *out, void *in, size_t n);
-void double2ulong_many( void *out, void *in, size_t n);
-void double2ulong_sat_many( void *out, void *in, size_t n);
-void ulong2ulong_many( void *out, void *in, size_t n);
-void ulong2ulong_sat_many( void *out, void *in, size_t n);
-void long2ulong_many( void *out, void *in, size_t n);
-void long2ulong_sat_many( void *out, void *in, size_t n);
-void uchar2long_many( void *out, void *in, size_t n);
-void uchar2long_sat_many( void *out, void *in, size_t n);
-void char2long_many( void *out, void *in, size_t n);
-void char2long_sat_many( void *out, void *in, size_t n);
-void ushort2long_many( void *out, void *in, size_t n);
-void ushort2long_sat_many( void *out, void *in, size_t n);
-void short2long_many( void *out, void *in, size_t n);
-void short2long_sat_many( void *out, void *in, size_t n);
-void uint2long_many( void *out, void *in, size_t n);
-void uint2long_sat_many( void *out, void *in, size_t n);
-void int2long_many( void *out, void *in, size_t n);
-void int2long_sat_many( void *out, void *in, size_t n);
-void float2long_many( void *out, void *in, size_t n);
-void float2long_sat_many( void *out, void *in, size_t n);
-void double2long_many( void *out, void *in, size_t n);
-void double2long_sat_many( void *out, void *in, size_t n);
-void ulong2long_many( void *out, void *in, size_t n);
-void ulong2long_sat_many( void *out, void *in, size_t n);
-void long2long_many( void *out, void *in, size_t n);
-void long2long_sat_many( void *out, void *in, size_t n);
+cl_int CustomConversionsTest::Run()
+{
+    int startMinVectorSize = gMinVectorSize;
+    Type inType, outType;
+    RoundingMode round;
+    SaturationMode sat;
 
-void uchar2uchar_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uchar )); }
-void uchar2uchar_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uchar )); }
-void char2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_char)); }}
-void char2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_char)); }}
-void ushort2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ushort)); }}
-void short2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_short)); }}
-void short2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_short)); }}
-void uint2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_uint)); }}
-void uint2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_uint)); }}
-void int2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_int)); }}
-void int2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_int)); }}
-void float2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_float)); }}
-void float2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_float)); }}
-void double2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_double)); }}
-void double2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_double)); }}
-void ulong2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ulong)); }}
-void long2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_long)); }}
-void long2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_long)); }}
-void uchar2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uchar)); }}
-void char2char_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_char )); }
-void char2char_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_char )); }
-void ushort2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ushort)); }}
-void short2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_short)); }}
-void short2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_short)); }}
-void uint2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uint)); }}
-void uint2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uint)); }}
-void int2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_int)); }}
-void int2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_int)); }}
-void float2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_float)); }}
-void float2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_float)); }}
-void double2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_double)); }}
-void double2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_double)); }}
-void ulong2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ulong)); }}
-void long2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_long)); }}
-void long2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_long)); }}
-void uchar2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uchar)); }}
-void char2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_char)); }}
-void char2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_char)); }}
-void ushort2ushort_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ushort )); }
-void ushort2ushort_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ushort )); }
-void short2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_short)); }}
-void short2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_short)); }}
-void uint2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uint)); }}
-void uint2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uint)); }}
-void int2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_int)); }}
-void int2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_int)); }}
-void float2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_float)); }}
-void float2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_float)); }}
-void double2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_double)); }}
-void double2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_double)); }}
-void ulong2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_ulong)); }}
-void long2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_long)); }}
-void long2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_long)); }}
-void uchar2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uchar)); }}
-void char2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_char)); }}
-void char2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_char)); }}
-void ushort2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ushort)); }}
-void short2short_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_short )); }
-void short2short_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_short )); }
-void uint2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uint)); }}
-void uint2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uint)); }}
-void int2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_int)); }}
-void int2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_int)); }}
-void float2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_float)); }}
-void float2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_float)); }}
-void double2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_double)); }}
-void double2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_double)); }}
-void ulong2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ulong)); }}
-void long2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_long)); }}
-void long2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_long)); }}
-void uchar2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_uchar)); }}
-void char2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_char)); }}
-void char2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_char)); }}
-void ushort2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ushort)); }}
-void short2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_short)); }}
-void short2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_short)); }}
-void uint2uint_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uint )); }
-void uint2uint_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uint )); }
-void int2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_int)); }}
-void int2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_int)); }}
-void float2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_float)); }}
-void float2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_float)); }}
-void double2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_double)); }}
-void double2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_double)); }}
-void ulong2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ulong)); }}
-void long2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_long)); }}
-void long2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_long)); }}
-void uchar2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uchar)); }}
-void char2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_char)); }}
-void char2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_char)); }}
-void ushort2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ushort)); }}
-void short2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_short)); }}
-void short2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_short)); }}
-void uint2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uint)); }}
-void uint2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uint)); }}
-void int2int_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_int )); }
-void int2int_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_int )); }
-void float2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_float)); }}
-void float2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_float)); }}
-void double2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_double)); }}
-void double2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_double)); }}
-void ulong2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ulong)); }}
-void long2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_long)); }}
-void long2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_long)); }}
-void uchar2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uchar)); }}
-void char2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_char)); }}
-void char2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_char)); }}
-void ushort2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ushort)); }}
-void short2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_short)); }}
-void short2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_short)); }}
-void uint2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uint)); }}
-void uint2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uint)); }}
-void int2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_int)); }}
-void int2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_int)); }}
-void float2float_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_float )); }
-void float2float_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_float )); }
-void double2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_double)); }}
-void double2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_double)); }}
-void ulong2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ulong)); }}
-void long2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_long)); }}
-void long2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_long)); }}
-void uchar2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uchar)); }}
-void char2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_char)); }}
-void char2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_char)); }}
-void ushort2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ushort)); }}
-void short2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_short)); }}
-void short2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_short)); }}
-void uint2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uint)); }}
-void uint2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uint)); }}
-void int2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_int)); }}
-void int2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_int)); }}
-void float2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_float)); }}
-void float2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_float)); }}
-void double2double_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_double )); }
-void double2double_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_double )); }
-void ulong2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ulong)); }}
-void long2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_long)); }}
-void long2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_long)); }}
-void uchar2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uchar)); }}
-void char2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_char)); }}
-void char2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_char)); }}
-void ushort2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_ushort)); }}
-void short2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_short)); }}
-void short2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_short)); }}
-void uint2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uint)); }}
-void uint2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uint)); }}
-void int2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_int)); }}
-void int2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_int)); }}
-void float2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_float)); }}
-void float2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_float)); }}
-void double2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_double)); }}
-void double2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_double)); }}
-void ulong2ulong_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ulong )); }
-void ulong2ulong_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ulong )); }
-void long2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_long)); }}
-void long2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_long)); }}
-void uchar2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uchar)); }}
-void char2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_char)); }}
-void char2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_char)); }}
-void ushort2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ushort)); }}
-void short2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_short)); }}
-void short2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_short)); }}
-void uint2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uint)); }}
-void uint2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uint)); }}
-void int2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_int)); }}
-void int2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_int)); }}
-void float2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_float)); }}
-void float2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_float)); }}
-void double2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_double)); }}
-void double2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_double)); }}
-void ulong2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ulong)); }}
-void long2long_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_long )); }
-void long2long_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_long )); }
+    for (int i = 0; i < argCount; i++)
+    {
+        if (conv_test::GetTestCase(argList[i], &outType, &inType, &sat, &round))
+        {
+            vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
+                       "%s.  Skipping....  *****\n\n",
+                       argList[i]);
+            continue;
+        }
 
-Convert gSaturatedConversions[kTypeCount][kTypeCount] = {
-    {    uchar2uchar_sat_many,    char2uchar_sat_many,    ushort2uchar_sat_many,    short2uchar_sat_many,    uint2uchar_sat_many,    int2uchar_sat_many,    float2uchar_sat_many,    double2uchar_sat_many,    ulong2uchar_sat_many,    long2uchar_sat_many,     },
-    {    uchar2char_sat_many,    char2char_sat_many,    ushort2char_sat_many,    short2char_sat_many,    uint2char_sat_many,    int2char_sat_many,    float2char_sat_many,    double2char_sat_many,    ulong2char_sat_many, long2char_sat_many,     },
-    {    uchar2ushort_sat_many,    char2ushort_sat_many,    ushort2ushort_sat_many,    short2ushort_sat_many,    uint2ushort_sat_many,    int2ushort_sat_many,    float2ushort_sat_many,    double2ushort_sat_many,    ulong2ushort_sat_many,    long2ushort_sat_many,     },
-    {    uchar2short_sat_many,    char2short_sat_many,    ushort2short_sat_many,    short2short_sat_many,    uint2short_sat_many,    int2short_sat_many,    float2short_sat_many,    double2short_sat_many,    ulong2short_sat_many,    long2short_sat_many,     },
-    {    uchar2uint_sat_many,    char2uint_sat_many,    ushort2uint_sat_many,    short2uint_sat_many,    uint2uint_sat_many,    int2uint_sat_many,    float2uint_sat_many,    double2uint_sat_many,    ulong2uint_sat_many, long2uint_sat_many,     },
-    {    uchar2int_sat_many,    char2int_sat_many,    ushort2int_sat_many,    short2int_sat_many,    uint2int_sat_many,    int2int_sat_many,    float2int_sat_many,    double2int_sat_many,    ulong2int_sat_many,long2int_sat_many,     },
-    {    uchar2float_sat_many,    char2float_sat_many,    ushort2float_sat_many,    short2float_sat_many,    uint2float_sat_many,    int2float_sat_many,    float2float_sat_many,    double2float_sat_many,    ulong2float_sat_many,    long2float_sat_many,     },
-    {    uchar2double_sat_many,    char2double_sat_many,    ushort2double_sat_many,    short2double_sat_many,    uint2double_sat_many,    int2double_sat_many,    float2double_sat_many,    double2double_sat_many,    ulong2double_sat_many,    long2double_sat_many,     },
-    {    uchar2ulong_sat_many,    char2ulong_sat_many,    ushort2ulong_sat_many,    short2ulong_sat_many,    uint2ulong_sat_many,    int2ulong_sat_many,    float2ulong_sat_many,    double2ulong_sat_many,    ulong2ulong_sat_many,    long2ulong_sat_many,     },
-    {    uchar2long_sat_many,    char2long_sat_many,    ushort2long_sat_many,    short2long_sat_many,    uint2long_sat_many,    int2long_sat_many,    float2long_sat_many,    double2long_sat_many,    ulong2long_sat_many, long2long_sat_many,     },
-};
+        // skip double if we don't have it
+        if (!gTestDouble && (inType == kdouble || outType == kdouble))
+        {
+            if (gHasDouble)
+            {
+                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                           gTypeNames[outType], gSaturationNames[sat],
+                           gRoundingModeNames[round], gTypeNames[inType]);
+                vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
+                     "off.\n");
+            }
+            continue;
+        }
 
-Convert gConversions[kTypeCount][kTypeCount] = {
-    {    uchar2uchar_many,    char2uchar_many,    ushort2uchar_many,    short2uchar_many,    uint2uchar_many,    int2uchar_many,    float2uchar_many,    double2uchar_many,    ulong2uchar_many,    long2uchar_many,     },
-    {    uchar2char_many,    char2char_many,    ushort2char_many,    short2char_many,    uint2char_many,    int2char_many,    float2char_many,    double2char_many,    ulong2char_many,    long2char_many,     },
-    {    uchar2ushort_many,    char2ushort_many,    ushort2ushort_many,    short2ushort_many,    uint2ushort_many,    int2ushort_many,    float2ushort_many,    double2ushort_many,    ulong2ushort_many,    long2ushort_many,     },
-    {    uchar2short_many,    char2short_many,    ushort2short_many,    short2short_many,    uint2short_many,    int2short_many,    float2short_many,    double2short_many,    ulong2short_many,    long2short_many,     },
-    {    uchar2uint_many,    char2uint_many,    ushort2uint_many,    short2uint_many,    uint2uint_many,    int2uint_many,    float2uint_many,    double2uint_many,    ulong2uint_many,    long2uint_many,     },
-    {    uchar2int_many,    char2int_many,    ushort2int_many,    short2int_many,    uint2int_many,    int2int_many,    float2int_many,    double2int_many,    ulong2int_many,    long2int_many,     },
-    {    uchar2float_many,    char2float_many,    ushort2float_many,    short2float_many,    uint2float_many,    int2float_many,    float2float_many,    double2float_many,    ulong2float_many,    long2float_many,     },
-    {    uchar2double_many,    char2double_many,    ushort2double_many,    short2double_many,    uint2double_many,    int2double_many,    float2double_many,    double2double_many,    ulong2double_many,    long2double_many,     },
-    {    uchar2ulong_many,    char2ulong_many,    ushort2ulong_many,    short2ulong_many,    uint2ulong_many,    int2ulong_many,    float2ulong_many,    double2ulong_many,    ulong2ulong_many,    long2ulong_many,     },
-    {    uchar2long_many,    char2long_many,    ushort2long_many,    short2long_many,    uint2long_many,    int2long_many,    float2long_many,    double2long_many,    ulong2long_many,    long2long_many,     },
-};
+        // skip longs on embedded
+        if (!gHasLong
+            && (inType == klong || outType == klong || inType == kulong
+                || outType == kulong))
+        {
+            continue;
+        }
+
+        // Skip the implicit converts if the rounding mode is not default or
+        // test is saturated
+        if (0 == startMinVectorSize)
+        {
+            if (sat || round != kDefaultRoundingMode)
+                gMinVectorSize = 1;
+            else
+                gMinVectorSize = 0;
+        }
+
+        IterOverSelectedTypes iter(typeIterator, *this, inType, outType, round,
+                                   sat);
+
+        iter.Run();
+
+        if (gFailCount)
+        {
+            vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                       gTypeNames[outType], gSaturationNames[sat],
+                       gRoundingModeNames[round], gTypeNames[inType]);
+        }
+    }
+
+    return gFailCount;
+}
+
+
+ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+    : context(context), device(device), queue(queue), num_elements(0),
+      typeIterator({ cl_uchar(0), cl_char(0), cl_ushort(0), cl_short(0),
+                     cl_uint(0), cl_int(0), cl_float(0), cl_double(0),
+                     cl_ulong(0), cl_long(0) })
+{}
+
+
+cl_int ConversionsTest::Run()
+{
+    IterOverTypes iter(typeIterator, *this);
+
+    iter.Run();
+
+    return gFailCount;
+}
+
+
+cl_int ConversionsTest::SetUp(int elements)
+{
+    num_elements = elements;
+    return CL_SUCCESS;
+}
+
+
+template <typename InType, typename OutType>
+void ConversionsTest::TestTypesConversion(const Type &inType,
+                                          const Type &outType, int &testNumber,
+                                          int startMinVectorSize)
+{
+    SaturationMode sat;
+    RoundingMode round;
+    int error;
+
+    // skip longs on embedded
+    if (!gHasLong
+        && (inType == klong || outType == klong || inType == kulong
+            || outType == kulong))
+    {
+        return;
+    }
+
+    for (sat = (SaturationMode)0; sat < kSaturationModeCount;
+         sat = (SaturationMode)(sat + 1))
+    {
+        // skip illegal saturated conversions to floating-point types
+        if (kSaturated == sat && (outType == kfloat || outType == kdouble))
+        {
+            continue;
+        }
+
+        for (round = (RoundingMode)0; round < kRoundingModeCount;
+             round = (RoundingMode)(round + 1))
+        {
+            if (++testNumber < gStartTestNumber)
+            {
+                continue;
+            }
+            else
+            {
+                if (gEndTestNumber > 0 && testNumber >= gEndTestNumber) return;
+            }
+
+            vlog("%d) Testing convert_%sn%s%s( %sn ):\n", testNumber,
+                 gTypeNames[outType], gSaturationNames[sat],
+                 gRoundingModeNames[round], gTypeNames[inType]);
+
+            // skip double if we don't have it
+            if (!gTestDouble && (inType == kdouble || outType == kdouble))
+            {
+                if (gHasDouble)
+                {
+                    vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                               "FAILED ** \n",
+                               testNumber, gTypeNames[outType],
+                               gSaturationNames[sat], gRoundingModeNames[round],
+                               gTypeNames[inType]);
+                    vlog("\t\tcl_khr_fp64 enabled, but double "
+                         "testing turned off.\n");
+                }
+                continue;
+            }
+
+            // Skip the implicit converts if the rounding mode is
+            // not default or test is saturated
+            if (0 == startMinVectorSize)
+            {
+                if (sat || round != kDefaultRoundingMode)
+                    gMinVectorSize = 1;
+                else
+                    gMinVectorSize = 0;
+            }
+
+            if ((error = DoTest<InType, OutType>(outType, inType, sat, round)))
+            {
+                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                           "FAILED ** \n",
+                           testNumber, gTypeNames[outType],
+                           gSaturationNames[sat], gRoundingModeNames[round],
+                           gTypeNames[inType]);
+            }
+        }
+    }
+}
+
+
+template <typename InType, typename OutType>
+int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
+                            RoundingMode round)
+{
+#ifdef __APPLE__
+    cl_ulong wall_start = mach_absolute_time();
+#endif
+
+    cl_uint threads = GetThreadCount();
+
+    DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
+    DataInfoSpec<InType, OutType> init_info(info);
+    WriteInputBufferInfo writeInputBufferInfo;
+    int vectorSize;
+    int error = 0;
+    uint64_t i;
+
+    gTestCount++;
+    size_t blockCount =
+        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
+    size_t step = blockCount;
+
+    for (i = 0; i < threads; i++)
+    {
+        init_info.mdv.emplace_back(MTdataHolder(gRandomSeed));
+    }
+
+    writeInputBufferInfo.outType = outType;
+    writeInputBufferInfo.inType = inType;
+
+    writeInputBufferInfo.calcInfo.resize(gMaxVectorSize);
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        writeInputBufferInfo.calcInfo[vectorSize].reset(
+            new CalcRefValsPat<InType, OutType>());
+        writeInputBufferInfo.calcInfo[vectorSize]->program =
+            conv_test::MakeProgram(
+                outType, inType, sat, round, vectorSize,
+                &writeInputBufferInfo.calcInfo[vectorSize]->kernel);
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize]->program)
+        {
+            gFailCount++;
+            return -1;
+        }
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize]->kernel)
+        {
+            gFailCount++;
+            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
+            return -2;
+        }
+
+        writeInputBufferInfo.calcInfo[vectorSize]->parent =
+            &writeInputBufferInfo;
+        writeInputBufferInfo.calcInfo[vectorSize]->vectorSize = vectorSize;
+        writeInputBufferInfo.calcInfo[vectorSize]->result = -1;
+    }
+
+    if (gSkipTesting) return error;
+
+    // Patch up rounding mode if default is RTZ
+    // We leave the part above in default rounding mode so that the right kernel
+    // is compiled.
+    if (std::is_same<OutType, cl_float>::value)
+    {
+        if (round == kDefaultRoundingMode && gIsRTZ)
+            init_info.round = round = kRoundTowardZero;
+    }
+
+    // Figure out how many elements are in a work block
+    // we handle 64-bit types a bit differently.
+    uint64_t lastCase = (8 * gTypeSizes[inType] > 32)
+        ? 0x100000000ULL
+        : 1ULL << (8 * gTypeSizes[inType]);
+
+    if (!gWimpyMode && gIsEmbedded)
+        step = blockCount * EMBEDDED_REDUCTION_FACTOR;
+
+    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
+    vlog("Testing... ");
+    fflush(stdout);
+    for (i = 0; i < (uint64_t)lastCase; i += step)
+    {
+
+        if (0 == (i & ((lastCase >> 3) - 1)))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+
+        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
+        writeInputBufferInfo.count = count;
+
+        // Create a user event to represent the status of the reference value
+        // computation completion
+        writeInputBufferInfo.calcReferenceValues =
+            clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
+        {
+            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        // retain for consumption by MapOutputBufferComplete
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
+        {
+            if ((error =
+                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
+            {
+                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
+                gFailCount++;
+                return error;
+            }
+        }
+
+        // Create a user event to represent when the callbacks are done
+        // verifying correctness
+        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.doneBarrier)
+        {
+            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
+
+        // retain for use by the callback that calls this
+        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
+
+        //      Call this in a multithreaded manner
+        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
+        init_info.start = i;
+        init_info.size = count / chunks;
+        if (init_info.size < 16384)
+        {
+            chunks = RoundUpToNextPowerOfTwo(threads);
+            init_info.size = count / chunks;
+            if (init_info.size < 16384)
+            {
+                init_info.size = count;
+                chunks = 1;
+            }
+        }
+
+        ThreadPool_Do(conv_test::InitData, chunks, &init_info);
+
+        // Copy the results to the device
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          count * gTypeSizes[inType], gIn, 0,
+                                          NULL, NULL)))
+        {
+            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        // Call completion callback for the write, which will enqueue the rest
+        // of the work.
+        conv_test::WriteInputBufferComplete((void *)&writeInputBufferInfo);
+
+        // Make sure the work is actually running, so we don't deadlock
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed with error %d\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        ThreadPool_Do(conv_test::PrepareReference, chunks, &init_info);
+
+        // signal we are done calculating the reference results
+        if ((error = clSetUserEventStatus(
+                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
+        {
+            vlog_error(
+                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
+                error);
+            gFailCount++;
+            return error;
+        }
+
+        // Wait for the event callbacks to finish verifying correctness.
+        if ((error = clWaitForEvents(
+                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
+        {
+            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
+
+        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
+        {
+            if ((error = writeInputBufferInfo.calcInfo[vectorSize]->result))
+            {
+                switch (inType)
+                {
+                    case kuchar:
+                    case kchar:
+                        vlog("Input value: 0x%2.2x ",
+                             ((unsigned char *)gIn)[error - 1]);
+                        break;
+                    case kushort:
+                    case kshort:
+                        vlog("Input value: 0x%4.4x ",
+                             ((unsigned short *)gIn)[error - 1]);
+                        break;
+                    case kuint:
+                    case kint:
+                        vlog("Input value: 0x%8.8x ",
+                             ((unsigned int *)gIn)[error - 1]);
+                        break;
+                    case kfloat:
+                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
+                        break;
+                    case kulong:
+                    case klong:
+                        vlog("Input value: 0x%16.16llx ",
+                             ((unsigned long long *)gIn)[error - 1]);
+                        break;
+                    case kdouble:
+                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
+                        break;
+                    default:
+                        vlog_error("Internal error at %s: %d\n", __FILE__,
+                                   __LINE__);
+                        abort();
+                        break;
+                }
+
+                // tell the user which conversion it was.
+                if (0 == vectorSize)
+                    vlog(" (implicit scalar conversion from %s to %s)\n",
+                         gTypeNames[inType], gTypeNames[outType]);
+                else
+                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
+                         sizeNames[vectorSize], gSaturationNames[sat],
+                         gRoundingModeNames[round], gTypeNames[inType],
+                         sizeNames[vectorSize]);
+
+                gFailCount++;
+                return error;
+            }
+        }
+    }
+
+    log_info("done.\n");
+
+    if (gTimeResults)
+    {
+        // Kick off tests for the various vector lengths
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
+        {
+            size_t workItemCount = blockCount / vectorSizes[vectorSize];
+            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
+                workItemCount /=
+                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            cl_uint k;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = conv_test::GetTime();
+                if ((error = conv_test::RunKernel(
+                         writeInputBufferInfo.calcInfo[vectorSize]->kernel,
+                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
+                {
+                    gFailCount++;
+                    return error;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    return error;
+                }
+
+                uint64_t endTime = conv_test::GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (workItemCount * vectorSizes[vectorSize]);
+            if (0 == vectorSize)
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "implicit convert %s -> %s", gTypeNames[inType],
+                          gTypeNames[outType]);
+            else
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
+                          sizeNames[vectorSize], gSaturationNames[sat],
+                          gRoundingModeNames[round], gTypeNames[inType],
+                          sizeNames[vectorSize]);
+        }
+    }
+
+    if (gWimpyMode)
+        vlog("\tWimp pass");
+    else
+        vlog("\tpassed");
+
+#ifdef __APPLE__
+    // record the run time
+    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
+#endif
+    vlog("\n\n");
+    fflush(stdout);
+
+    return error;
+}
+
+#if !defined(__APPLE__)
+void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
+#endif
+
+#if defined(_MSC_VER)
+/* function is defined in "compat.h" */
+#else
+double SubtractTime(uint64_t endTime, uint64_t startTime)
+{
+    uint64_t diff = endTime - startTime;
+    static double conversion = 0.0;
+    // Compute the tick-to-seconds factor while it is still unset (0.0).
+    if (0.0 == conversion)
+    {
+#if defined(__APPLE__)
+        mach_timebase_info_data_t info = { 0, 0 };
+        kern_return_t err = mach_timebase_info(&info);
+        if (0 == err)
+            conversion = 1e-9 * (double)info.numer / (double)info.denom;
+#else
+        // This function consumes output from conv_test::GetTime() and converts
+        // it to seconds. No factor is set here, so the result is always 0.0.
+#warning need accurate ticks to seconds conversion factor here. Times are invalid.
+#endif
+    }
+
+    // strictly speaking we should also be subtracting out timer latency here
+    return conversion * (double)diff;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
+{
+    // OR a 1 into allow[k] wherever the 0x7f800000 bits of x[k] are all zero.
+    for (cl_uint k = 0; k != count; ++k)
+        if (0 == (x[k] & 0x7f800000U)) allow[k] |= (uint8_t)1;
+}
+
+
+void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &ptr);
+
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data);
+
+// Note: May be called reentrantly
+void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &info)
+{
+    cl_int status;
+    // info is one per-vector-size record; the event lives on its parent.
+    cl_event calcReferenceValues = info->parent->calcReferenceValues;
+    // We know that the map is done; wait for the main thread to finish
+    // calculating the reference values. The callback receives the address of
+    // the caller's unique_ptr, so that object must outlive the callback.
+    if ((status =
+             clSetEventCallback(calcReferenceValues, CL_COMPLETE,
+                                CalcReferenceValuesComplete, (void *)&info)))
+    {
+        vlog_error("ERROR: clSetEventCallback failed in "
+                   "MapResultValuesComplete with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
+    }
+
+    // this thread no longer needs its reference to the parent's
+    // calcReferenceValues event, so release it
+    if ((status = clReleaseEvent(calcReferenceValues)))
+    {
+        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
+                   "with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
+    }
+
+    // no need to flush since we didn't enqueue anything
+
+    // NOTE(review): an earlier comment here mentioned an event "e", but this
+    // function takes no event parameter; nothing else is released here.
+}
+
+
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data)
+{
+    std::unique_ptr<CalcRefValsBase> &info =
+        *(std::unique_ptr<CalcRefValsBase> *)data;
+    // (data is the std::unique_ptr address passed to clSetEventCallback)
+    cl_uint vectorSize = info->vectorSize;
+    cl_uint count = info->parent->count;
+    Type outType =
+        info->parent->outType; // the data type of the conversion result
+    Type inType = info->parent->inType; // the data type of the conversion input
+    size_t j;
+    cl_int error;
+    cl_event doneBarrier = info->parent->doneBarrier;
+    // Report a failed event status. NOTE(review): this early return skips the
+    // barrierCount decrement below; confirm no waiter on doneBarrier hangs.
+    if (CL_SUCCESS != status)
+    {
+        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
+                   status);
+        gFailCount++; // lazy about thread safety here
+        return;
+    }
+
+    // Now we know that both results have been mapped back from the device, and
+    // the main thread is done calculating the reference results. It is now time
+    // to check the results.
+
+    // verify results
+    void *mapped = info->p;
+
+    // Patch up NaNs conversions to integer to zero -- these can be converted to
+    // any integer
+    if (outType != kfloat && outType != kdouble)
+    {
+        if (inType == kfloat)
+        {
+            float *inp = (float *)gIn;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+        if (inType == kdouble)
+        {
+            double *inp = (double *)gIn;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+    }
+    else if (inType == kfloat || inType == kdouble)
+    { // outtype and intype is float or double.  NaN conversions for float <->
+      // double can be any NaN
+        if (inType == kfloat && outType == kdouble)
+        {
+            float *inp = (float *)gIn;
+            double *outp = (double *)mapped;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
+            }
+        }
+        if (inType == kdouble && outType == kfloat)
+        {
+            double *inp = (double *)gIn;
+            float *outp = (float *)mapped;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
+            }
+        }
+    }
+    // A bitwise match with gRef passes outright; else defer to check_result.
+    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
+        info->result =
+            info->check_result(mapped, count, vectorSizes[vectorSize]);
+    else
+        info->result = 0;
+
+    // Fill the output buffer with junk and release it
+    {
+        cl_uint pattern = 0xffffdead;
+        memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
+        if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
+                                             mapped, 0, NULL, NULL)))
+        {
+            vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
+                       "CalcReferenceValuesComplete  (%d)\n",
+                       error);
+            gFailCount++;
+        }
+    }
+    // Presumably AtomicAdd returns the old value, so the last callback signals.
+    if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
+    {
+        if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
+        {
+            vlog_error("ERROR: clSetUserEventStatus failed in "
+                       "CalcReferenceValuesComplete (err: %d). We're probably "
+                       "going to deadlock.\n",
+                       status);
+            gFailCount++;
+            return;
+        }
+
+        if ((status = clReleaseEvent(doneBarrier)))
+        {
+            vlog_error("ERROR: clReleaseEvent failed in "
+                       "CalcReferenceValuesComplete (err: %d).\n",
+                       status);
+            gFailCount++;
+            return;
+        }
+    }
+    // e is neither used nor released in this function. NOTE(review): the
+    // original comment states it was already released by
+    // WriteInputBufferComplete; that is not visible from this code -- confirm.
+}
+
+//
+
+namespace conv_test {
+
+////////////////////////////////////////////////////////////////////////////////
+
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
+{
+    // p is the opaque job argument; it is reinterpreted as a DataInitBase and
+    // the job/thread ids are forwarded to its init() hook.
+    auto *initializer = static_cast<DataInitBase *>(p);
+    initializer->init(job_id, thread_id);
+    return CL_SUCCESS;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
+{
+    DataInitBase *info = (DataInitBase *)p;
+
+    cl_uint count = info->size;
+    Type inType = info->inType;
+    Type outType = info->outType;
+    RoundingMode round = info->round;
+    size_t j;
+    // Computes the reference for job job_id: reads gIn, writes gRef/gAllowZ.
+    Force64BitFPUPrecision();
+    // s/a/d: this job's slices of the input, allow-zero and reference buffers.
+    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
+    void *a = (cl_uchar *)gAllowZ + job_id * count;
+    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
+
+
+    if (outType != inType)
+    {
+        // create the reference while we wait
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+        /* ARM VFP doesn't have hardware instruction for converting from 64-bit
+         * integer to float types, hence GCC ARM uses the floating-point
+         * emulation code despite which -mfloat-abi setting it is. But the
+         * emulation code in libgcc.a has only one rounding mode (round to
+         * nearest even in this case) and ignores the user rounding mode setting
+         * in hardware. As a result setting rounding modes in hardware won't
+         * give correct rounding results for type convert from 64-bit integer to
+         * float using GCC for ARM compiler so for testing different rounding
+         * modes, we need to use alternative reference function. ARM64 does have
+         * an instruction, however we cannot guarantee the compiler will use it.
+         * On all ARM architectures use emulation to calculate reference.*/
+        switch (round)
+        {
+            /* conversions to floating-point type use the current rounding mode.
+             * The only default floating-point rounding mode supported is round
+             * to nearest even i.e the current rounding mode will be _rte for
+             * floating-point types. */
+            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
+            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
+            case kRoundUp: qcom_rm = qcomRTP; break;
+            case kRoundDown: qcom_rm = qcomRTN; break;
+            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
+            default:
+                vlog_error("ERROR: undefined rounding mode %d\n", round);
+                break;
+        }
+        qcom_sat = info->sat;
+#endif
+
+        RoundingMode oldRound = set_round(round, outType);
+
+        if (info->sat)
+            info->conv_array_sat(d, s, count);
+        else
+            info->conv_array(d, s, count);
+
+        set_round(oldRound, outType);
+        // Decide if we allow a zero result in addition to the correctly rounded
+        // one. NOTE(review): setAllowZ reads s as 32-bit words even when only
+        // outType is kfloat -- confirm this is intended for non-float inputs.
+        memset(a, 0, count);
+        if (gForceFTZ)
+        {
+            if (inType == kfloat || outType == kfloat)
+                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+        }
+    }
+    else
+    {
+        // Copy the input to the reference
+        memcpy(d, s, info->size * gTypeSizes[inType]);
+    }
+
+    // Patch up NaNs conversions to integer to zero -- these can be converted to
+    // any integer
+    if (info->outType != kfloat && info->outType != kdouble)
+    {
+        if (inType == kfloat)
+        {
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+        if (inType == kdouble)
+        {
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+    }
+    else if (inType == kfloat || inType == kdouble)
+    { // outtype and intype is float or double.  NaN conversions for float <->
+      // double can be any NaN
+        if (inType == kfloat && outType == kdouble)
+        {
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j])) ((double *)d)[j] = NAN;
+            }
+        }
+        if (inType == kdouble && outType == kfloat)
+        {
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j])) ((float *)d)[j] = NAN;
+            }
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+uint64_t GetTime(void)
+{
+#if defined(__APPLE__)
+    return mach_absolute_time(); // high precision timer, precision < 1 us
+#elif defined(_MSC_VER)
+    return ReadTime();
+#else
+    // No clock is implemented for other platforms: this path always returns 0,
+    // so any duration derived from it is meaningless (see the #warning).
+#warning need accurate clock here.  Times are invalid.
+    return 0;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Note: not called reentrantly
+void WriteInputBufferComplete(void *data)
+{
+    cl_int status;
+    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
+    cl_uint count = info->count;
+    int vectorSize;
+    // One barrier slot per tested vector size (decremented by the callbacks).
+    info->barrierCount = gMaxVectorSize - gMinVectorSize;
+
+    // For each vector size: run its kernel on gInBuffer, then map the output
+    // buffer with a blocking map (CL_TRUE) and keep the pointer in calcInfo.
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        size_t workItemCount =
+            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
+
+        if ((status = conv_test::RunKernel(info->calcInfo[vectorSize]->kernel,
+                                           gInBuffer, gOutBuffers[vectorSize],
+                                           workItemCount)))
+        {
+            gFailCount++;
+            return;
+        }
+
+        info->calcInfo[vectorSize]->p = clEnqueueMapBuffer(
+            gQueue, gOutBuffers[vectorSize], CL_TRUE,
+            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
+            NULL, NULL, &status);
+        {
+            if (status)
+            {
+                vlog_error("ERROR: WriteInputBufferComplete calback failed "
+                           "with status: %d\n",
+                           status);
+                gFailCount++;
+                return;
+            }
+        }
+    }
+    // Register the result-check callback for every vector size.
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        MapResultValuesComplete(info->calcInfo[vectorSize]);
+    }
+
+    // Make sure the work starts moving -- otherwise we may deadlock
+    if ((status = clFlush(gQueue)))
+    {
+        vlog_error(
+            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
+            status);
+        gFailCount++;
+        return;
+    }
+    // NOTE(review): an earlier comment here mentioned an event "e" released by
+    // the main thread; this function takes no event parameter and releases
+    // nothing itself.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                       RoundingMode round, int vectorSize, cl_kernel *outKernel)
+{
+    // Builds the OpenCL C source for one conversion kernel (implicit scalar
+    // when vectorSize == 0, else an explicit convert_* call) and compiles it.
+    // Returns the program and stores the kernel in *outKernel, or NULL on error.
+    cl_program program;
+    char testName[256];
+    std::ostringstream source;
+    if (outType == kdouble || inType == kdouble)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    // Create the program. This is a bit complicated because we are trying to
+    // avoid byte and short stores.
+    if (0 == vectorSize)
+    {
+        // Create the type names (snprintf always NUL-terminates, unlike strncpy).
+        char inName[32];
+        char outName[32];
+        snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
+        snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
+        snprintf(testName, sizeof(testName), "test_implicit_%s_%s", outName,
+                 inName);
+        source << "__kernel void " << testName << "( __global " << inName
+               << " *src, __global " << outName << " *dest )\n";
+        source << "{\n";
+        source << "   size_t i = get_global_id(0);\n";
+        source << "   dest[i] =  src[i];\n";
+        source << "}\n";
+
+        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
+             gTypeNames[outType]);
+        fflush(stdout);
+    }
+    else
+    {
+        int vectorSizetmp = vectorSizes[vectorSize];
+
+        // Create the type names.
+        char convertString[128];
+        char inName[32];
+        char outName[32];
+        switch (vectorSizetmp)
+        {
+            case 1:
+                snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
+                snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
+            case 3:
+                snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
+                snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
+                snprintf(convertString, sizeof(convertString),
+                         "convert_%s3%s%s", outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s3", convertString, inName);
+                vlog("Building %s( %s3 ) test\n", convertString, inName);
+                break;
+            default:
+                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
+                         vectorSizetmp);
+                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
+                         vectorSizetmp);
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
+        }
+        fflush(stdout);
+
+        if (vectorSizetmp == 3)
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   if( i + 1 < get_global_size(0))\n";
+            source << "       vstore3( " << convertString
+                   << "( vload3( i, src)), i, dest );\n";
+            source << "   else\n";
+            source << "   {\n";
+            source << "       " << inName << "3 in;\n";
+            source << "       " << outName << "3 out;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           in.y = src[3*i+1];\n";
+            source << "       in.x = src[3*i];\n";
+            source << "       out = " << convertString << "( in ); \n";
+            source << "       dest[3*i] = out.x;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           dest[3*i+1] = out.y;\n";
+            source << "   }\n";
+            source << "}\n";
+        }
+        else
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   dest[i] = " << convertString << "( src[i] );\n";
+            source << "}\n";
+        }
+    }
+    *outKernel = NULL;
+
+    const char *flags = NULL;
+    if (gForceFTZ) flags = "-cl-denorms-are-zero";
+
+    // build it
+    std::string sourceString = source.str();
+    const char *programSource = sourceString.c_str();
+    int error = create_single_kernel_helper(gContext, &program, outKernel, 1,
+                                            &programSource, testName, flags);
+    if (error)
+    {
+        vlog_error("Failed to build kernel/program (err = %d).\n", error);
+        return NULL;
+    }
+
+    return program;
+}
+
+//
+
+int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount)
+{
+    // Binds inBuf/outBuf as kernel arguments 0 and 1 and enqueues a 1-D range
+    // of blockCount work-items on gQueue. Returns 0, or the failing CL status.
+    // The global dimensions are just the blockCount to execute since we haven't
+    // set up multiple queues for multiple devices.
+    int error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
+    // Set the second argument only if the first succeeded, so the status that
+    // is logged and returned is a real CL code, not two codes OR-ed together.
+    if (!error) error = clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
+    if (error)
+    {
+        vlog_error("FAILED -- could not set kernel args (%d)\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
+                                        NULL, 0, NULL, NULL)))
+    {
+        vlog_error("FAILED -- could not execute kernel (%d)\n", error);
+        return error;
+    }
+    return 0;
+}
+
+
+int GetTestCase(const char *name, Type *outType, Type *inType,
+                SaturationMode *sat, RoundingMode *round)
+{
+    // Parses a test name of the form <out>[<sat>][<round>]_<in>, consuming it
+    // left to right. Returns 0 on success; -1 if no destination type matches,
+    // -2 if the '_' separator is missing, -3 if no source type matches, and
+    // -4 if characters remain after the source type.
+    auto startsWith = [](const char *text, const char *prefix) {
+        return 0 == strncmp(text, prefix, strlen(prefix));
+    };
+
+    // Destination (return) type.
+    int dstIdx = 0;
+    while (dstIdx < kTypeCount && !startsWith(name, gTypeNames[dstIdx]))
+        ++dstIdx;
+    if (dstIdx == kTypeCount) return -1;
+    *outType = (Type)dstIdx;
+    name += strlen(gTypeNames[dstIdx]);
+
+    // Optional saturation name; entry 0 is the default and is not searched.
+    *sat = (SaturationMode)0;
+    for (int satIdx = 1; satIdx < kSaturationModeCount; ++satIdx)
+    {
+        if (!startsWith(name, gSaturationNames[satIdx])) continue;
+        *sat = (SaturationMode)satIdx;
+        name += strlen(gSaturationNames[satIdx]);
+        break;
+    }
+
+    // Optional rounding-mode name; entry 0 is the default and is not searched.
+    *round = (RoundingMode)0;
+    for (int rndIdx = 1; rndIdx < kRoundingModeCount; ++rndIdx)
+    {
+        if (!startsWith(name, gRoundingModeNames[rndIdx])) continue;
+        *round = (RoundingMode)rndIdx;
+        name += strlen(gRoundingModeNames[rndIdx]);
+        break;
+    }
+
+    // A single '_' separates the destination spec from the source type.
+    if (*name != '_') return -2;
+    ++name;
+
+    // Source type; it must run to the end of the string.
+    int srcIdx = 0;
+    while (srcIdx < kTypeCount && !startsWith(name, gTypeNames[srcIdx]))
+        ++srcIdx;
+    if (srcIdx == kTypeCount) return -3;
+    *inType = (Type)srcIdx;
+    name += strlen(gTypeNames[srcIdx]);
+    if (*name != '\0') return -4;
+
+    return 0;
+}
+
+} // namespace conv_test
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index ab887af..2314ee7 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,8 +16,6 @@
 #ifndef BASIC_TEST_CONVERSIONS_H
 #define BASIC_TEST_CONVERSIONS_H
 
-#include "harness/compat.h"
-
 #if !defined(_WIN32)
 #include <unistd.h>
 #endif
@@ -33,22 +31,23 @@
 #endif
 
 #include "harness/mt19937.h"
+#include "harness/testHarness.h"
+#include "harness/typeWrappers.h"
 
-typedef void (*Convert)( void *dest, void *src, size_t );
+#include <memory>
+#include <tuple>
+#include <vector>
 
-#define kVectorSizeCount    6
-#define kMaxVectorSize      16
+#include "conversions_data_info.h"
 
-typedef enum
-{
-    kUnsaturated = 0,
-    kSaturated,
+#define kVectorSizeCount 6
+#define kMaxVectorSize 16
+#define kPageSize 4096
 
-    kSaturationModeCount
-}SaturationMode;
+#define BUFFER_SIZE (1024 * 1024)
+#define EMBEDDED_REDUCTION_FACTOR 16
+#define PERF_LOOP_COUNT 100
 
-extern Convert gConversions[kTypeCount][kTypeCount];                // [dest format][source format]
-extern Convert gSaturatedConversions[kTypeCount][kTypeCount];       // [dest format][source format]
 extern const char *gTypeNames[ kTypeCount ];
 extern const char *gRoundingModeNames[ kRoundingModeCount ];        // { "", "_rte", "_rtp", "_rtn", "_rtz" }
 extern const char *gSaturationNames[ kSaturationModeCount ];        // { "", "_sat" }
@@ -68,5 +67,324 @@
 typedef int (*CheckResults)( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
 extern CheckResults gCheckResults[ kTypeCount ];
 
+#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
+
+extern MTdata gMTdata;
+extern cl_command_queue gQueue;
+extern cl_context gContext;
+extern cl_mem gInBuffer;
+extern cl_mem gOutBuffers[];
+extern int gHasDouble;
+extern int gTestDouble;
+extern int gWimpyMode;
+extern int gWimpyReductionFactor;
+extern int gSkipTesting;
+extern int gMinVectorSize;
+extern int gMaxVectorSize;
+extern int gForceFTZ;
+extern int gTimeResults;
+extern int gReportAverageTimes;
+extern int gStartTestNumber;
+extern int gEndTestNumber;
+extern int gIsRTZ;
+extern void *gIn;
+extern void *gRef;
+extern void *gAllowZ;
+extern void *gOut[];
+
+extern const char **argList;
+extern int argCount;
+
+extern const char *sizeNames[];
+extern int vectorSizes[];
+
+extern size_t gComputeDevices;
+extern uint32_t gDeviceFrequency;
+
+namespace conv_test {
+
+cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                       RoundingMode round, int vectorSize,
+                       cl_kernel *outKernel);
+
+int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount);
+
+int GetTestCase(const char *name, Type *outType, Type *inType,
+                SaturationMode *sat, RoundingMode *round);
+
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
+uint64_t GetTime(void);
+
+void WriteInputBufferComplete(void *);
+void *FlushToZero(void);
+void UnFlushToZero(void *);
+} // namespace conv_test
+
+// Per-vector-size state for one callback chain (see member comments).
+struct CalcRefValsBase
+{
+    virtual ~CalcRefValsBase() = default;
+    virtual int check_result(void *, uint32_t, int) { return 0; } // default: 0
+    // pointer back to the parent WriteInputBufferInfo struct
+    struct WriteInputBufferInfo *parent = nullptr;
+    clKernelWrapper kernel; // the kernel for this vector size
+    clProgramWrapper program; // the program for this vector size
+    cl_uint vectorSize = 0; // the vector size for this callback chain
+    void *p = nullptr; // the pointer to mapped result data for this vector size
+    cl_int result = CL_SUCCESS; // initialised so it is never read indeterminate
+};
+
+template <typename InType, typename OutType>
+struct CalcRefValsPat : CalcRefValsBase
+{
+    int check_result(void *, uint32_t, int) override; // replaces the base version, which just returns 0
+};
+
+struct WriteInputBufferInfo
+{
+    WriteInputBufferInfo()
+        : calcReferenceValues(nullptr), doneBarrier(nullptr), count(0),
+          outType(kuchar), inType(kuchar), barrierCount(0)
+    {} // events start null, counters at zero, both types default to kuchar
+
+    volatile cl_event
+        calcReferenceValues; // user event which signals when main thread is
+                             // done calculating reference values
+    volatile cl_event
+        doneBarrier; // user event which signals when worker threads are done
+    cl_uint count; // the number of elements in the array
+    Type outType; // the data type of the conversion result
+    Type inType; // the data type of the conversion input
+    volatile int barrierCount;
+
+    std::vector<std::unique_ptr<CalcRefValsBase>> calcInfo; // owned entries; each has a `parent` back-pointer field
+};
+
+// Must be aligned with Type enums!
+using TypeIter = std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint,
+                            cl_int, cl_float, cl_double, cl_ulong, cl_long>;
+
+// Helper test fixture holding the OpenCL device, context and queue used
+// to run the type-conversion tests.
+struct ConversionsTest
+{
+    virtual ~ConversionsTest() = default;
+
+    ConversionsTest(cl_device_id device, cl_context context,
+                    cl_command_queue queue);
+
+    cl_int SetUp(int elements);
+
+    // Test body returning an OpenCL error code (declared non-virtual)
+    cl_int Run();
+
+    template <typename InType, typename OutType>
+    int DoTest(Type outType, Type inType, SaturationMode sat,
+               RoundingMode round); // argument order is (out, in)
+
+    template <typename InType, typename OutType>
+    void TestTypesConversion(const Type &inType, const Type &outType, int &tn,
+                             const int smvs); // argument order is (in, out)
+
+protected:
+    cl_context context; // declared before device, unlike the ctor parameter order
+    cl_device_id device;
+    cl_command_queue queue;
+
+    size_t num_elements;
+
+    TypeIter typeIterator; // presumably the tuple handed to the IterOver* helpers - confirm in the .cpp
+};
+
+struct CustomConversionsTest : ConversionsTest
+{
+    CustomConversionsTest(cl_device_id device, cl_context context,
+                          cl_command_queue queue)
+        : ConversionsTest(device, context, queue)
+    {}
+
+    cl_int Run(); // hides ConversionsTest::Run, which is not virtual
+};
+
+// Builds fixture T, runs SetUp(num_elements), then returns T::Run().
+template <class T>
+int MakeAndRunTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements)
+{
+    T fixture(device, context, queue);
+
+    const cl_int setup_status = fixture.SetUp(num_elements);
+    test_error_ret(setup_status, "Error in test initialization", TEST_FAIL);
+    return fixture.Run();
+}
+
+// Checks that a runtime Type tag and a compile-time element type agree.
+struct TestType
+{
+    // Returns true only when `in` is the Type enumerator that corresponds
+    // to T (kuchar <-> cl_uchar, ..., klong <-> cl_long); false otherwise.
+    template <typename T> bool testType(Type in)
+    {
+        if (in == kuchar) return std::is_same<cl_uchar, T>::value;
+        if (in == kchar) return std::is_same<cl_char, T>::value;
+        if (in == kushort) return std::is_same<cl_ushort, T>::value;
+        if (in == kshort) return std::is_same<cl_short, T>::value;
+        if (in == kuint) return std::is_same<cl_uint, T>::value;
+        if (in == kint) return std::is_same<cl_int, T>::value;
+        if (in == kfloat) return std::is_same<cl_float, T>::value;
+        if (in == kdouble) return std::is_same<cl_double, T>::value;
+        if (in == kulong) return std::is_same<cl_ulong, T>::value;
+        if (in == klong) return std::is_same<cl_long, T>::value;
+        return false;
+    }
+};
+
+// Helper structures to iterate over all tuple attributes of different types
+struct IterOverTypes : public TestType
+{
+    IterOverTypes(const TypeIter &typeIter, ConversionsTest &test)
+        : inType((Type)0), outType((Type)0), typeIter(typeIter), test(test),
+          testNumber(-1), startMinVectorSize(gMinVectorSize)
+    {}
+
+    void Run() { for_each_out_elem(typeIter); } // visits every (out, in) pair of tuple element types
+
+protected:
+    template <std::size_t Out = 0, typename OutType>
+    void iterate_out_type(const OutType &t)
+    {
+        for_each_in_elem<0, Out, OutType>(typeIter);
+        outType = (Type)(outType + 1); // keep the enum tag in step with the tuple index
+        inType = (Type)0; // restart the input enumeration for the next output type
+    }
+
+    template <std::size_t In, std::size_t Out, typename OutType,
+              typename InType>
+    void iterate_in_type(const InType &t)
+    {
+        if (!testType<InType>(inType)) vlog_error("Unexpected data type!\n");
+
+        if (!testType<OutType>(outType)) vlog_error("Unexpected data type!\n");
+
+        // run the conversions (this still happens after a mismatch is logged)
+        test.TestTypesConversion<InType, OutType>(inType, outType, testNumber,
+                                                  startMinVectorSize);
+        inType = (Type)(inType + 1);
+    }
+
+    template <std::size_t Out = 0, typename... Tp>
+    inline typename std::enable_if<Out == sizeof...(Tp), void>::type
+    for_each_out_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {} // recursion end: Out equals the tuple size
+
+    template <std::size_t Out = 0, typename... Tp>
+        inline typename std::enable_if < Out<sizeof...(Tp), void>::type // enabled while Out < sizeof...(Tp)
+        for_each_out_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_out_type<Out>(std::get<Out>(t));
+        for_each_out_elem<Out + 1, Tp...>(t);
+    }
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+    inline typename std::enable_if<In == sizeof...(Tp), void>::type
+    for_each_in_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {} // recursion end: In equals the tuple size
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+        inline typename std::enable_if < In<sizeof...(Tp), void>::type // enabled while In < sizeof...(Tp)
+        for_each_in_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_in_type<In, Out, OutType>(std::get<In>(t));
+        for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
+    }
+
+protected:
+    Type inType; // enum tag of the input type currently being visited
+    Type outType; // enum tag of the output type currently being visited
+    const TypeIter &typeIter; // reference only: the tuple must outlive this object
+    ConversionsTest &test;
+    int testNumber; // starts at -1; passed by reference to TestTypesConversion
+    int startMinVectorSize; // gMinVectorSize as it was at construction
+};
+
+
+// Helper structures to select type 2 type conversion test case
+struct IterOverSelectedTypes : public TestType
+{
+    IterOverSelectedTypes(const TypeIter &typeIter, ConversionsTest &test,
+                          const Type in, const Type out,
+                          const RoundingMode round, const SaturationMode sat)
+        : inType(in), outType(out), rounding(round), saturation(sat),
+          typeIter(typeIter), test(test), testNumber(-1),
+          startMinVectorSize(gMinVectorSize)
+    {}
+
+    void Run() { for_each_out_elem(typeIter); } // walks all type pairs; only the matching one runs
+
+protected:
+    template <std::size_t Out = 0, typename OutType>
+    void iterate_out_type(const OutType &t)
+    {
+        for_each_in_elem<0, Out, OutType>(typeIter);
+    }
+
+    template <std::size_t In, std::size_t Out, typename OutType,
+              typename InType>
+    void iterate_in_type(const InType &t)
+    {
+        if (testType<InType>(inType) && testType<OutType>(outType))
+        {
+            // run selected conversion; DoTest's int return value is dropped
+            // here - testing of the result will happen afterwards
+            test.DoTest<InType, OutType>(outType, inType, saturation, rounding);
+        }
+    }
+
+    template <std::size_t Out = 0, typename... Tp>
+    inline typename std::enable_if<Out == sizeof...(Tp), void>::type
+    for_each_out_elem(const std::tuple<Tp...> &)
+    {} // recursion end: Out equals the tuple size
+
+    template <std::size_t Out = 0, typename... Tp>
+        inline typename std::enable_if < Out<sizeof...(Tp), void>::type // enabled while Out < sizeof...(Tp)
+        for_each_out_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_out_type<Out>(std::get<Out>(t));
+        for_each_out_elem<Out + 1, Tp...>(t);
+    }
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+    inline typename std::enable_if<In == sizeof...(Tp), void>::type
+    for_each_in_elem(const std::tuple<Tp...> &)
+    {} // recursion end: In equals the tuple size
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+        inline typename std::enable_if < In<sizeof...(Tp), void>::type // enabled while In < sizeof...(Tp)
+        for_each_in_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_in_type<In, Out, OutType>(std::get<In>(t));
+        for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
+    }
+
+protected:
+    Type inType; // fixed at construction; never advanced in this struct
+    Type outType; // fixed at construction; never advanced in this struct
+    RoundingMode rounding;
+    SaturationMode saturation;
+
+    const TypeIter &typeIter; // reference only: the tuple must outlive this object
+    ConversionsTest &test;
+    int testNumber; // initialised to -1 but not read by any member of this struct
+    int startMinVectorSize; // initialised but not read by any member of this struct
+};
+
+
 #endif /* BASIC_TEST_CONVERSIONS_H */
 
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
new file mode 100644
index 0000000..4f46a24
--- /dev/null
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -0,0 +1,792 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef CONVERSIONS_DATA_INFO_H
+#define CONVERSIONS_DATA_INFO_H
+
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
+#else
+#include <CL/opencl.h>
+#endif
+
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+#include "fplib.h"
+extern bool qcom_sat;
+extern roundingMode qcom_rm;
+#endif
+
+#include "harness/mt19937.h"
+#include "harness/rounding_mode.h"
+
+#include <vector>
+
+#if defined(__linux__)
+#include <sys/param.h>
+#include <libgen.h>
+#endif
+
+extern size_t gTypeSizes[kTypeCount];
+extern void *gIn;
+
+
+typedef enum
+{
+    kUnsaturated = 0,
+    kSaturated,
+
+    kSaturationModeCount // number of modes; used as an array bound (e.g. gSaturationNames)
+} SaturationMode;
+
+struct DataInitInfo
+{
+    cl_ulong start;
+    cl_uint size;
+    Type outType; // destination type of the conversion
+    Type inType; // source type of the conversion
+    SaturationMode sat;
+    RoundingMode round;
+    cl_uint threads;
+
+    static std::vector<uint32_t> specialValuesUInt; // statics: declared here, defined elsewhere
+    static std::vector<float> specialValuesFloat;
+    static std::vector<double> specialValuesDouble;
+};
+
+struct DataInitBase : public DataInitInfo
+{
+    virtual ~DataInitBase() = default;
+
+    explicit DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {} // copies the plain settings
+    virtual void conv_array(void *out, void *in, size_t n) {} // no-op unless overridden
+    virtual void conv_array_sat(void *out, void *in, size_t n) {} // no-op unless overridden
+    virtual void init(const cl_uint &, const cl_uint &) {} // no-op unless overridden
+};
+
+template <typename InType, typename OutType>
+struct DataInfoSpec : public DataInitBase
+{
+    explicit DataInfoSpec(const DataInitInfo &agg);
+
+    // helpers
+    float round_to_int(float f);
+    long long round_to_int_and_clamp(double d);
+
+    OutType absolute(const OutType &x);
+
+    // actual conversion of reference values
+    void conv(OutType *out, InType *in);
+    void conv_sat(OutType *out, InType *in);
+
+    // min/max ranges for output type of data
+    std::pair<OutType, OutType> ranges;
+
+    // matrix of clamping ranges for each rounding type
+    std::vector<std::pair<InType, InType>> clamp_ranges; // the constructor fills it only for FP -> integer
+
+    std::vector<MTdataHolder> mdv;
+
+    void conv_array(void *out, void *in, size_t n) override
+    {
+        for (size_t i = 0; i < n; i++) // out/in are treated as arrays of n OutType/InType
+            conv(&((OutType *)out)[i], &((InType *)in)[i]);
+    }
+
+    void conv_array_sat(void *out, void *in, size_t n) override
+    {
+        for (size_t i = 0; i < n; i++) // same walk as conv_array, using conv_sat
+            conv_sat(&((OutType *)out)[i], &((InType *)in)[i]);
+    }
+
+    void init(const cl_uint &, const cl_uint &) override;
+    InType clamp(const InType &);
+    inline float fclamp(float lo, float v, float hi)
+    {
+        v = v < lo ? lo : v;
+        return v < hi ? v : hi; // a NaN v fails both comparisons and yields hi
+    }
+
+    inline double dclamp(double lo, double v, double hi)
+    {
+        v = v < lo ? lo : v;
+        return v < hi ? v : hi; // a NaN v fails both comparisons and yields hi
+    }
+};
+
+template <typename InType, typename OutType>
+DataInfoSpec<InType, OutType>::DataInfoSpec(const DataInitInfo &agg)
+    : DataInitBase(agg), mdv(0)
+{
+    if (std::is_same<cl_float, OutType>::value)
+        ranges = std::make_pair(CL_FLT_MIN, CL_FLT_MAX);
+    else if (std::is_same<cl_double, OutType>::value)
+        ranges = std::make_pair(CL_DBL_MIN, CL_DBL_MAX);
+    else if (std::is_same<cl_uchar, OutType>::value)
+        ranges = std::make_pair(0, CL_UCHAR_MAX);
+    else if (std::is_same<cl_char, OutType>::value)
+        ranges = std::make_pair(CL_CHAR_MIN, CL_CHAR_MAX);
+    else if (std::is_same<cl_ushort, OutType>::value)
+        ranges = std::make_pair(0, CL_USHRT_MAX);
+    else if (std::is_same<cl_short, OutType>::value)
+        ranges = std::make_pair(CL_SHRT_MIN, CL_SHRT_MAX);
+    else if (std::is_same<cl_uint, OutType>::value)
+        ranges = std::make_pair(0, CL_UINT_MAX);
+    else if (std::is_same<cl_int, OutType>::value)
+        ranges = std::make_pair(CL_INT_MIN, CL_INT_MAX);
+    else if (std::is_same<cl_ulong, OutType>::value)
+        ranges = std::make_pair(0, CL_ULONG_MAX);
+    else if (std::is_same<cl_long, OutType>::value)
+        ranges = std::make_pair(CL_LONG_MIN, CL_LONG_MAX);
+
+    // clang-format off
+    // for readability's sake keep this section unformatted
+    if (std::is_floating_point<InType>::value)
+    { // from float/double
+        InType outMin = static_cast<InType>(ranges.first);
+        InType outMax = static_cast<InType>(ranges.second);
+
+        InType eps = std::is_same<InType, cl_float>::value ? (InType) FLT_EPSILON : (InType) DBL_EPSILON;
+        if (std::is_integral<OutType>::value)
+        { // to char/uchar/short/ushort/int/uint/long/ulong
+            if (sizeof(OutType)<=sizeof(cl_short))
+            { // to char/uchar/short/ushort
+                clamp_ranges=
+                {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
+                  {outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
+                  {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, outMax-1.f},
+                  {outMin-0.0f, outMax - outMax * 0.5f * eps },
+                  {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, outMax - outMax * 0.5f * eps}};
+            }
+            else if (std::is_same<InType, cl_float>::value)
+            { // from float
+                if (std::is_same<OutType, cl_uint>::value)
+                { // to uint
+                    clamp_ranges=
+                    { {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-0.0f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) },
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)}};
+                }
+                else if (std::is_same<OutType, cl_int>::value)
+                { // to int
+                    clamp_ranges=
+                    { {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) },
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)}};
+                }
+                else if (std::is_same<OutType, cl_ulong>::value)
+                { // to ulong
+                    clamp_ranges=
+                    {{outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-0.0f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) },
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)}};
+                }
+                else if (std::is_same<OutType, cl_long>::value)
+                { // to long
+                    clamp_ranges=
+                    { {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)}};
+                }
+            }
+            else
+            { // from double
+                if (std::is_same<OutType, cl_uint>::value)
+                { // to uint
+                    clamp_ranges=
+                    { {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * eps},
+                      {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * eps},
+                      {outMin-1.0f+0.5f*eps, outMax},
+                      {outMin-0.0f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21) },
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21)}};
+                }
+                else if (std::is_same<OutType, cl_int>::value)
+                { // to int
+                    clamp_ranges=
+                    { {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps},
+                      {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps},
+                      {outMin-1.0f+outMax*eps, outMax},
+                      {outMin-0.0f, outMax + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps },
+                      {outMin-1.0f+outMax*eps, outMax + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps}};
+                }
+                else if (std::is_same<OutType, cl_ulong>::value)
+                { // to ulong
+                    clamp_ranges=
+                    {{outMin-0.5f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-0.5f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-0.0f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) },
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)}};
+                }
+                else if (std::is_same<OutType, cl_long>::value)
+                { // to long
+                    clamp_ranges=
+                    { {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)}};
+                }
+            }
+        }
+    }
+    // clang-format on
+}
+
+template <typename InType, typename OutType>
+float DataInfoSpec<InType, OutType>::round_to_int(float f)
+{
+    static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23),
+                                    -MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23) }; // +2^23 and -2^23
+
+    // Round fractional values to integer in round towards nearest mode
+    if (fabsf(f) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23))
+    {
+        volatile float x = f;
+        float magicVal = magic[f < 0]; // (f < 0) picks the constant with f's sign
+
+#if defined(__SSE__)
+        // Defeat x87 based arithmetic, which can't do FTZ, and will round this
+        // incorrectly
+        __m128 v = _mm_set_ss(x);
+        __m128 m = _mm_set_ss(magicVal);
+        v = _mm_add_ss(v, m);
+        v = _mm_sub_ss(v, m);
+        _mm_store_ss((float *)&x, v);
+#else
+        x += magicVal;
+        x -= magicVal;
+#endif
+        f = x;
+    }
+    return f; // inputs with |f| >= 2^23 (and NaN) are returned unchanged
+}
+
+template <typename InType, typename OutType>
+long long DataInfoSpec<InType, OutType>::round_to_int_and_clamp(double f)
+{
+    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52),
+                                     MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) }; // +2^52 and -2^52
+
+    if (f >= -(double)LLONG_MIN) return LLONG_MAX; // saturate on the high side
+
+    if (f <= (double)LLONG_MIN) return LLONG_MIN; // saturate on the low side
+
+    // Round fractional values to integer in round towards nearest mode
+    if (fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52))
+    {
+        volatile double x = f;
+        double magicVal = magic[f < 0]; // (f < 0) picks the constant with f's sign
+#if defined(__SSE2__) || defined(_MSC_VER)
+        // Defeat x87 based arithmetic, which can't do FTZ, and will round this
+        // incorrectly
+        __m128d v = _mm_set_sd(x);
+        __m128d m = _mm_set_sd(magicVal);
+        v = _mm_add_sd(v, m);
+        v = _mm_sub_sd(v, m);
+        _mm_store_sd((double *)&x, v);
+#else
+        x += magicVal;
+        x -= magicVal;
+#endif
+        f = x;
+    }
+    return (long long)f; // NOTE(review): a NaN input passes both range guards and reaches this cast
+}
+
+// Clears the IEEE-754 sign bit of a float/double; other types only log.
+template <typename InType, typename OutType>
+OutType DataInfoSpec<InType, OutType>::absolute(const OutType &x)
+{
+    union {
+        cl_uint u32; // bit view of a 32-bit float
+        cl_ulong u64; // bit view of a 64-bit double (sign is bit 63)
+        OutType f;
+    } u;
+    u.f = x;
+    if (std::is_same<OutType, float>::value) u.u32 &= 0x7fffffff;
+    else if (std::is_same<OutType, double>::value)
+        u.u64 &= 0x7fffffffffffffffULL; // a 32-bit view cannot reach bit 63
+    else
+        log_error("Unexpected argument type of DataInfoSpec::absolute");
+    return u.f;
+}
+
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
+{
+    if (std::is_same<cl_float, InType>::value)
+    {
+        cl_float inVal = *in;
+
+        if (std::is_floating_point<OutType>::value)
+        {
+            *out = (OutType)inVal;
+        }
+        else if (std::is_same<cl_ulong, OutType>::value)
+        {
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+            // VS2005 (at least) on x86 uses fistp to store the float as a
+            // 64-bit int. However, fistp stores it as a signed int, and some of
+            // the test values won't fit into a signed int. (These test values
+            // are >= 2^63.) The result on VS2005 is that these end up silently
+            // (at least by default settings) clamped to the max lowest ulong.
+            cl_float x = round_to_int(inVal);
+            if (x >= 9223372036854775808.0f)
+            {
+                x -= 9223372036854775808.0f;
+                ((cl_ulong *)out)[0] = x;
+                ((cl_ulong *)out)[0] += 9223372036854775808ULL;
+            }
+            else
+            {
+                ((cl_ulong *)out)[0] = x;
+            }
+#else
+            *out = round_to_int(inVal);
+#endif
+        }
+        else if (std::is_same<cl_long, OutType>::value)
+        {
+            *out = round_to_int_and_clamp(inVal);
+        }
+        else
+            *out = round_to_int(inVal);
+    }
+    else if (std::is_same<cl_double, InType>::value)
+    {
+        if (std::is_same<cl_float, OutType>::value)
+            *out = (OutType)*in;
+        else
+            *out = rint(*in);
+    }
+    else if (std::is_same<cl_ulong, InType>::value
+             || std::is_same<cl_long, InType>::value)
+    {
+        if (std::is_same<cl_double, OutType>::value)
+        {
+#if defined(_MSC_VER)
+            cl_ulong l = ((cl_ulong *)in)[0];
+            double result;
+
+            if (std::is_same<cl_ulong, InType>::value)
+            {
+                cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
+                                              : (cl_long)l;
+#if defined(_M_X64)
+                _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), sl));
+#else
+                result = sl;
+#endif
+                ((double *)out)[0] =
+                    (l == 0 ? 0.0 : (((cl_long)l < 0) ? result * 2.0 : result));
+            }
+            else
+            {
+                _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), l));
+                ((double *)out)[0] =
+                    (l == 0 ? 0.0 : result); // Per IEEE-754-2008 5.4.1, 0's
+                                             // always convert to +0.0
+            }
+#else
+            // Use volatile to prevent optimization by Clang compiler
+            volatile InType vi = *in;
+            *out = (vi == 0 ? 0.0 : static_cast<OutType>(vi));
+#endif
+        }
+        else if (std::is_same<cl_float, OutType>::value)
+        {
+            cl_float outVal = 0.f;
+
+#if defined(_MSC_VER) && defined(_M_X64)
+            cl_ulong l = ((cl_ulong *)in)[0];
+            float result;
+            if (std::is_same<cl_ulong, InType>::value)
+            {
+                cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
+                                              : (cl_long)l;
+                _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
+                outVal = (l == 0 ? 0.0f
+                                 : (((cl_long)l < 0) ? result * 2.0f : result));
+            }
+            else
+            {
+                _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), l));
+                outVal = (l == 0 ? 0.0f : result); // Per IEEE-754-2008 5.4.1,
+                                                   // 0's always convert to +0.0
+            }
+#else
+            InType l = ((InType *)in)[0];
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+            /* ARM VFP doesn't have hardware instruction for converting from
+             * 64-bit integer to float types, hence GCC ARM uses the
+             * floating-point emulation code despite which -mfloat-abi setting
+             * it is. But the emulation code in libgcc.a has only one rounding
+             * mode (round to nearest even in this case) and ignores the user
+             * rounding mode setting in hardware. As a result setting rounding
+             * modes in hardware won't give correct rounding results for type
+             * conversion from 64-bit integer to float using GCC for ARM, so
+             * for testing different rounding modes, we need to use alternative
+             * reference function. ARM64 does have an instruction, however we
+             * cannot guarantee the compiler will use it.  On all ARM
+             * architectures use emulation to calculate reference.*/
+            if (std::is_same<cl_ulong, InType>::value)
+                outVal = qcom_u64_2_f32(l, qcom_sat, qcom_rm);
+            else
+                outVal = (l == 0 ? 0.0f : qcom_s64_2_f32(l, qcom_sat, qcom_rm));
+#else
+            outVal = (l == 0 ? 0.0f : (float)l); // Per IEEE-754-2008 5.4.1, 0's
+                                                 // always convert to +0.0
+#endif
+#endif
+
+            *out = outVal;
+        }
+        else
+        {
+            *out = (OutType)*in;
+        }
+    }
+    else
+    {
+        if (std::is_same<cl_float, OutType>::value)
+        {
+            // Use volatile to prevent optimization by Clang compiler
+            volatile InType vi = *in;
+            // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0
+            *out = (vi == 0 ? 0.0f : vi);
+        }
+        else if (std::is_same<cl_double, OutType>::value)
+        {
+            // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0
+            *out = (*in == 0 ? 0.0 : *in);
+        }
+        else
+        {
+            *out = (OutType)*in;
+        }
+    }
+}
+
+#define CLAMP(_lo, _x, _hi)                                                    \
+    ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
+
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::conv_sat(OutType *out, InType *in)
+{ // Writes *in converted to OutType to *out, clamping integer results.
+    if (std::is_floating_point<InType>::value)
+    {
+        if (std::is_floating_point<OutType>::value)
+        { // in float/double, out float/double
+            *out = (OutType)(*in);
+        }
+        else if ((std::is_same<InType, cl_float>::value)
+                 && std::is_same<cl_ulong, OutType>::value)
+        { // in float, out ulong
+            cl_float x = round_to_int(*in);
+
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+            // VS2005 (at least) on x86 uses fistp to store the float as a
+            // 64-bit int. However, fistp stores it as a signed int, and some of
+            // the test values won't fit into a signed int. (These test values
+            // are >= 2^63.) The result on VS2005 is that these end up silently
+            // (at least by default settings) clamped to the max lowest ulong.
+            if (x >= 18446744073709551616.0f)
+            { // 2^64
+                *out = 0xFFFFFFFFFFFFFFFFULL;
+            }
+            else if (x < 0)
+            {
+                *out = 0;
+            }
+            else if (x >= 9223372036854775808.0f)
+            { // 2^63
+                x -= 9223372036854775808.0f;
+                *out = x;
+                *out += 9223372036854775808ULL;
+            }
+            else
+            {
+                *out = x;
+            }
+#else // other compilers: saturate at 2^64 and at 0 in one expression
+            *out = x >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
+                ? 0xFFFFFFFFFFFFFFFFULL
+                : x < 0 ? 0 : (OutType)x;
+#endif // _MSC_VER on x86/x64
+        }
+        else if ((std::is_same<InType, cl_float>::value)
+                 && std::is_same<cl_long, OutType>::value)
+        { // in float, out long
+            cl_float f = round_to_int(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
+                ? 0x7FFFFFFFFFFFFFFFULL
+                : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
+                    ? 0x8000000000000000LL
+                    : (OutType)f;
+        }
+        else if (std::is_same<InType, cl_double>::value
+                 && std::is_same<cl_ulong, OutType>::value)
+        { // in double, out ulong
+            InType f = rint(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
+                ? 0xFFFFFFFFFFFFFFFFULL
+                : f < 0 ? 0 : (OutType)f;
+        }
+        else if (std::is_same<InType, cl_double>::value
+                 && std::is_same<cl_long, OutType>::value)
+        { // in double, out long
+            InType f = rint(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
+                ? 0x7FFFFFFFFFFFFFFFULL
+                : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
+                    ? 0x8000000000000000LL
+                    : (OutType)f;
+        }
+        else
+        { // in float/double, out char/uchar/short/ushort/int/uint
+            *out =
+                CLAMP(ranges.first, round_to_int_and_clamp(*in), ranges.second);
+        }
+    }
+    else if (std::is_integral<InType>::value
+             && std::is_integral<OutType>::value)
+    {
+        {
+            if ((std::is_signed<InType>::value
+                 && std::is_signed<OutType>::value)
+                || (!std::is_signed<InType>::value
+                    && !std::is_signed<OutType>::value))
+            { // same signedness on both sides
+                if (sizeof(InType) <= sizeof(OutType))
+                { // same width or widening: the value always fits
+                    *out = (OutType)*in;
+                }
+                else
+                { // narrowing: clamp to [ranges.first, ranges.second]
+                    *out = CLAMP(ranges.first, *in, ranges.second);
+                }
+            }
+            else
+            { // mixed signed/unsigned types
+                if (sizeof(InType) < sizeof(OutType))
+                {
+                    *out = (!std::is_signed<InType>::value)
+                        ? (OutType)*in
+                        : CLAMP(0, *in, ranges.second); // *in < 0 ? 0 : *in
+                }
+                else
+                { // bigger/equal mixed signed/unsigned types - always clamp
+                    *out = CLAMP(0, *in, ranges.second);
+                }
+            }
+        }
+    }
+    else
+    { // InType integral, OutType floating
+        *out = std::is_signed<InType>::value ? (OutType)*in
+                                             : absolute((OutType)*in);
+    }
+}
+
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::init(const cl_uint &job_id,
+                                         const cl_uint &thread_id)
+{ // Fills this job's slice of gIn with generated input values.
+    uint64_t ulStart = start;
+    void *pIn = (char *)gIn + job_id * size * gTypeSizes[inType];
+
+    if (std::is_integral<InType>::value)
+    {
+        InType *o = (InType *)pIn;
+        if (sizeof(InType) <= sizeof(cl_short))
+        { // char/uchar/ushort/short
+            for (int i = 0; i < size; i++) o[i] = ulStart++;
+        }
+        else if (sizeof(InType) <= sizeof(cl_int))
+        { // int/uint
+            int i = 0;
+            if (gIsEmbedded)
+                for (i = 0; i < size; i++)
+                    o[i] = (InType)genrand_int32(mdv[thread_id]);
+            else
+                for (i = 0; i < size; i++) o[i] = (InType)i + ulStart;
+
+            if (0 == ulStart)
+            { // start == 0: overwrite the buffer tail with special values
+                size_t tableSize = specialValuesUInt.size()
+                    * sizeof(decltype(specialValuesUInt)::value_type);
+                if (sizeof(InType) * size < tableSize)
+                    tableSize = sizeof(InType) * size;
+                memcpy((char *)(o + i) - tableSize, &specialValuesUInt.front(),
+                       tableSize);
+            }
+        }
+        else
+        { // long/ulong
+            cl_ulong *o = (cl_ulong *)pIn; // shadows the outer o
+            cl_ulong i, j, k;
+
+            i = 0;
+            if (ulStart == 0)
+            {
+                // Try various powers of two
+                for (j = 0; j < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[j] = (cl_ulong)1 << j;
+                i = j;
+
+                // try the complement of those
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[i++] = ~((cl_ulong)1 << j);
+
+                // Try various negative powers of two
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[i++] = (cl_ulong)0xFFFFFFFFFFFFFFFEULL << j;
+
+                // try various powers of two plus 1, shifted by various amounts
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    for (k = 0;
+                         i < (cl_ulong)size && k < 8 * sizeof(cl_ulong) - j;
+                         k++)
+                        o[i++] = (((cl_ulong)1 << j) + 1) << k;
+
+                // try various powers of two minus 1
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    for (k = 0;
+                         i < (cl_ulong)size && k < 8 * sizeof(cl_ulong) - j;
+                         k++)
+                        o[i++] = (((cl_ulong)1 << j) - 1) << k;
+
+                // Other patterns
+                cl_ulong pattern[] = {
+                    0x3333333333333333ULL, 0x5555555555555555ULL,
+                    0x9999999999999999ULL, 0x6666666666666666ULL,
+                    0xccccccccccccccccULL, 0xaaaaaaaaaaaaaaaaULL
+                };
+                cl_ulong mask[] = { 0xffffffffffffffffULL,
+                                    0xff00ff00ff00ff00ULL,
+                                    0xffff0000ffff0000ULL,
+                                    0xffffffff00000000ULL };
+                for (j = 0; i < (cl_ulong)size
+                     && j < sizeof(pattern) / sizeof(pattern[0]);
+                     j++)
+                    for (k = 0; i + 2 <= (cl_ulong)size
+                         && k < sizeof(mask) / sizeof(mask[0]);
+                         k++)
+                    {
+                        o[i++] = pattern[j] & mask[k];
+                        o[i++] = pattern[j] & ~mask[k];
+                    }
+            }
+
+            auto &md = mdv[thread_id]; // random 64-bit fill for the rest
+            for (; i < (cl_ulong)size; i++)
+                o[i] = (cl_ulong)genrand_int32(md)
+                    | ((cl_ulong)genrand_int32(md) << 32);
+        }
+    } // integrals
+    else if (std::is_same<InType, cl_float>::value)
+    {
+        cl_uint *o = (cl_uint *)pIn; // floats are generated as bit patterns
+        int i;
+
+        if (gIsEmbedded)
+            for (i = 0; i < size; i++)
+                o[i] = (cl_uint)genrand_int32(mdv[thread_id]);
+        else
+            for (i = 0; i < size; i++) o[i] = (cl_uint)i + ulStart;
+
+        if (0 == ulStart)
+        { // start == 0: overwrite the buffer tail with special values
+            size_t tableSize = specialValuesFloat.size()
+                * sizeof(decltype(specialValuesFloat)::value_type);
+            if (sizeof(InType) * size < tableSize)
+                tableSize = sizeof(InType) * size;
+            memcpy((char *)(o + i) - tableSize, &specialValuesFloat.front(),
+                   tableSize);
+        }
+
+        if (kUnsaturated == sat)
+        { // unsaturated test: pass every input through clamp()
+            InType *f = (InType *)pIn;
+            for (i = 0; i < size; i++) f[i] = clamp(f[i]);
+        }
+    }
+    else if (std::is_same<InType, cl_double>::value)
+    {
+        InType *o = (InType *)pIn;
+        int i = 0;
+
+        union {
+            uint64_t u;
+            InType d;
+        } u;
+
+        for (i = 0; i < size; i++)
+        {
+            uint64_t z = i + ulStart;
+
+            uint32_t bits = ((uint32_t)z ^ (uint32_t)(z >> 32));
+            // split 0x89abcdef to 0x89abc00000000def
+            u.u = bits & 0xfffU;
+            u.u |= (uint64_t)(bits & ~0xfffU) << 32;
+            // sign extend the leading bit of def segment as sign bit so that
+            // the middle region consists of either all 1s or 0s
+            u.u -= (bits & 0x800U) << 1;
+            o[i] = u.d;
+        }
+
+        if (0 == ulStart)
+        { // start == 0: overwrite the buffer tail with special values
+            size_t tableSize = specialValuesDouble.size()
+                * sizeof(decltype(specialValuesDouble)::value_type);
+            if (sizeof(InType) * size < tableSize)
+                tableSize = sizeof(InType) * size;
+            memcpy((char *)(o + i) - tableSize, &specialValuesDouble.front(),
+                   tableSize);
+        }
+
+        if (kUnsaturated == sat) // named enumerator, as in the float path
+            for (i = 0; i < size; i++) o[i] = clamp(o[i]);
+    }
+}
+
+template <typename InType, typename OutType>
+InType DataInfoSpec<InType, OutType>::clamp(const InType &in)
+{ // Clamps a float/double input when OutType is integral; else returns in.
+    if (std::is_integral<OutType>::value)
+    {
+        if (std::is_same<InType, cl_float>::value)
+        { // limits come from the clamp_ranges entry for the rounding mode
+            return fclamp(clamp_ranges[round].first, in,
+                          clamp_ranges[round].second);
+        }
+        else if (std::is_same<InType, cl_double>::value)
+        { // same limits, double-precision helper
+            return dclamp(clamp_ranges[round].first, in,
+                          clamp_ranges[round].second);
+        }
+    }
+    return in; // every other type combination is returned unchanged
+}
+
+#endif /* CONVERSIONS_DATA_INFO_H */
diff --git a/test_conformance/conversions/fplib.h b/test_conformance/conversions/fplib.h
index 534550a..c69b1e8 100644
--- a/test_conformance/conversions/fplib.h
+++ b/test_conformance/conversions/fplib.h
@@ -13,6 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+#ifndef CONVERSIONS_FPLIB_H
+#define CONVERSIONS_FPLIB_H
+
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -28,3 +31,5 @@
 
 float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd);
 float qcom_s64_2_f32(int64_t data, bool sat, roundingMode rnd);
+
+#endif /* CONVERSIONS_FPLIB_H */
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 2ee0546..dab61dc 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
-#include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 #include "harness/testHarness.h"
-#include "harness/kernelHelpers.h"
 #include "harness/parseParameters.h"
+#include "harness/mt19937.h"
+
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif
@@ -33,7 +32,6 @@
 #include <libgen.h>
 #endif
 
-#include "mingw_compat.h"
 #if defined(__MINGW32__)
 #include <sys/param.h>
 #endif
@@ -49,283 +47,77 @@
 #include <time.h>
 
 #include <algorithm>
+#include <type_traits>
+#include <vector>
 
 #include "Sleep.h"
+
 #include "basic_test_conversions.h"
-
-#if (defined(_WIN32) && defined(_MSC_VER))
-// need for _controlfp_s and rouinding modes in RoundingMode
-#include "harness/testHarness.h"
-#endif
-
-#pragma mark -
-#pragma mark globals
-
-#define BUFFER_SIZE (1024 * 1024)
-#define kPageSize 4096
-#define EMBEDDED_REDUCTION_FACTOR 16
-#define PERF_LOOP_COUNT 100
-
-#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
+#include <climits>
+#include <cstring>
 
 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
 #include "fplib.h"
-extern bool qcom_sat;
-extern roundingMode qcom_rm;
 #endif
 
-const char **argList = NULL;
-int argCount = 0;
-cl_context gContext = NULL;
-cl_command_queue gQueue = NULL;
-char appName[64] = "ctest";
-int gStartTestNumber = -1;
-int gEndTestNumber = 0;
-#if defined(__APPLE__)
-int gTimeResults = 1;
-#else
-int gTimeResults = 0;
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+/* Rounding modes and saturation for use with qcom 64 bit to float conversion
+ * library */
+bool qcom_sat;
+roundingMode qcom_rm;
 #endif
-int gReportAverageTimes = 0;
-void *gIn = NULL;
-void *gRef = NULL;
-void *gAllowZ = NULL;
-void *gOut[kCallStyleCount] = { NULL };
-cl_mem gInBuffer;
-cl_mem gOutBuffers[kCallStyleCount];
-size_t gComputeDevices = 0;
-uint32_t gDeviceFrequency = 0;
-int gWimpyMode = 0;
-int gWimpyReductionFactor = 128;
-int gSkipTesting = 0;
-int gForceFTZ = 0;
-int gMultithread = 1;
-int gIsRTZ = 0;
-uint32_t gSimdSize = 1;
-int gHasDouble = 0;
-int gTestDouble = 1;
-const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
-const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
-int gMinVectorSize = 0;
-int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
-static MTdata gMTdata;
 
-#pragma mark -
-#pragma mark Declarations
 
 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 test_status InitCL(cl_device_id device);
-static int GetTestCase(const char *name, Type *outType, Type *inType,
-                       SaturationMode *sat, RoundingMode *round);
-static int DoTest(cl_device_id device, Type outType, Type inType,
-                  SaturationMode sat, RoundingMode round, MTdata d);
-static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
-                              RoundingMode round, int vectorSize,
-                              cl_kernel *outKernel);
-static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
-                     size_t blockCount);
 
-void *FlushToZero(void);
-void UnFlushToZero(void *);
 
-// Windows (since long double got deprecated) sets the x87 to 53-bit precision
-// (that's x87 default state).  This causes problems with the tests that
-// convert long and ulong to float and double or otherwise deal with values
-// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
-static inline void Force64BitFPUPrecision(void)
-{
-#if __MINGW32__
-    // The usual method is to use _controlfp as follows:
-    //     #include <float.h>
-    //     _controlfp(_PC_64, _MCW_PC);
-    //
-    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
-    // divergent code just use inline assembly which works for both.
-    unsigned short int orig_cw = 0;
-    unsigned short int new_cw = 0;
-    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
-    new_cw = orig_cw | 0x0300; // set precision to 64-bit
-    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
-#else
-    /* Implement for other platforms if needed */
-#endif
-}
+const char *gTypeNames[kTypeCount] = { "uchar", "char", "ushort", "short",
+                                       "uint",  "int",  "float",  "double",
+                                       "ulong", "long" }; // see gTypeSizes
+
+const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp",
+                                                       "_rtn", "_rtz" };
+
+const char *gSaturationNames[2] = { "", "_sat" }; // "" = no suffix
+
+size_t gTypeSizes[kTypeCount] = {
+    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_uint),  sizeof(cl_int),  sizeof(cl_float),  sizeof(cl_double),
+    sizeof(cl_ulong), sizeof(cl_long),
+}; // byte sizes, listed in the same type order as gTypeNames
+
+char appName[64] = "ctest";
+int gMultithread = 1;
+
 
 int test_conversions(cl_device_id device, cl_context context,
                      cl_command_queue queue, int num_elements)
 {
-    int error, i, testNumber = -1;
-    int startMinVectorSize = gMinVectorSize;
-    Type inType, outType;
-    RoundingMode round;
-    SaturationMode sat;
-
     if (argCount)
     {
-        for (i = 0; i < argCount; i++)
-        {
-            if (GetTestCase(argList[i], &outType, &inType, &sat, &round))
-            {
-                vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
-                           "%s.  Skipping....  *****\n\n",
-                           argList[i]);
-                continue;
-            }
-
-            // skip double if we don't have it
-            if (!gTestDouble && (inType == kdouble || outType == kdouble))
-            {
-                if (gHasDouble)
-                {
-                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
-                               gTypeNames[outType], gSaturationNames[sat],
-                               gRoundingModeNames[round], gTypeNames[inType]);
-                    vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
-                         "off.\n");
-                }
-
-                continue;
-            }
-
-            // skip longs on embedded
-            if (!gHasLong
-                && (inType == klong || outType == klong || inType == kulong
-                    || outType == kulong))
-            {
-                continue;
-            }
-
-            // Skip the implicit converts if the rounding mode is not default or
-            // test is saturated
-            if (0 == startMinVectorSize)
-            {
-                if (sat || round != kDefaultRoundingMode)
-                    gMinVectorSize = 1;
-                else
-                    gMinVectorSize = 0;
-            }
-
-            if ((error = DoTest(device, outType, inType, sat, round, gMTdata)))
-            {
-                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
-                           gTypeNames[outType], gSaturationNames[sat],
-                           gRoundingModeNames[round], gTypeNames[inType]);
-            }
-        }
+        return MakeAndRunTest<CustomConversionsTest>(device, context, queue,
+                                                     num_elements);
     }
     else
     {
-        for (outType = (Type)0; outType < kTypeCount;
-             outType = (Type)(outType + 1))
-        {
-            for (inType = (Type)0; inType < kTypeCount;
-                 inType = (Type)(inType + 1))
-            {
-                // skip longs on embedded
-                if (!gHasLong
-                    && (inType == klong || outType == klong || inType == kulong
-                        || outType == kulong))
-                {
-                    continue;
-                }
-
-                for (sat = (SaturationMode)0; sat < kSaturationModeCount;
-                     sat = (SaturationMode)(sat + 1))
-                {
-                    // skip illegal saturated conversions to float type
-                    if (kSaturated == sat
-                        && (outType == kfloat || outType == kdouble))
-                    {
-                        continue;
-                    }
-
-                    for (round = (RoundingMode)0; round < kRoundingModeCount;
-                         round = (RoundingMode)(round + 1))
-                    {
-                        if (++testNumber < gStartTestNumber)
-                        {
-                            //     vlog( "%d) skipping convert_%sn%s%s( %sn
-                            //     )\n", testNumber, gTypeNames[ outType ],
-                            //     gSaturationNames[ sat ],
-                            //     gRoundingModeNames[round], gTypeNames[inType]
-                            //     );
-                            continue;
-                        }
-                        else
-                        {
-                            if (gEndTestNumber > 0
-                                && testNumber >= gEndTestNumber)
-                            {
-                                goto exit;
-                            }
-                        }
-
-                        vlog("%d) Testing convert_%sn%s%s( %sn ):\n",
-                             testNumber, gTypeNames[outType],
-                             gSaturationNames[sat], gRoundingModeNames[round],
-                             gTypeNames[inType]);
-
-                        // skip double if we don't have it
-                        if (!gTestDouble
-                            && (inType == kdouble || outType == kdouble))
-                        {
-                            if (gHasDouble)
-                            {
-                                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
-                                           "FAILED ** \n",
-                                           testNumber, gTypeNames[outType],
-                                           gSaturationNames[sat],
-                                           gRoundingModeNames[round],
-                                           gTypeNames[inType]);
-                                vlog("\t\tcl_khr_fp64 enabled, but double "
-                                     "testing turned off.\n");
-                            }
-                            continue;
-                        }
-
-                        // Skip the implicit converts if the rounding mode is
-                        // not default or test is saturated
-                        if (0 == startMinVectorSize)
-                        {
-                            if (sat || round != kDefaultRoundingMode)
-                                gMinVectorSize = 1;
-                            else
-                                gMinVectorSize = 0;
-                        }
-
-                        if ((error = DoTest(device, outType, inType, sat, round,
-                                            gMTdata)))
-                        {
-                            vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
-                                       "FAILED ** \n",
-                                       testNumber, gTypeNames[outType],
-                                       gSaturationNames[sat],
-                                       gRoundingModeNames[round],
-                                       gTypeNames[inType]);
-                        }
-                    }
-                }
-            }
-        }
+        return MakeAndRunTest<ConversionsTest>(device, context, queue,
+                                               num_elements);
     }
-
-exit:
-    return gFailCount;
 }
 
+
 test_definition test_list[] = {
     ADD_TEST(conversions),
 };
 
 const int test_num = ARRAY_SIZE(test_list);
 
-#pragma mark -
 
 int main(int argc, const char **argv)
 {
     int error;
-    cl_uint seed = (cl_uint)time(NULL);
 
     argc = parseCustomParam(argc, argv);
     if (argc == -1)
@@ -352,8 +144,8 @@
 #endif
 
     vlog("===========================================================\n");
-    vlog("Random seed: %u\n", seed);
-    gMTdata = init_genrand(seed);
+    vlog("Random seed: %u\n", gRandomSeed);
+    gMTdata = init_genrand(gRandomSeed);
 
     const char *arg[] = { argv[0] };
     int ret =
@@ -378,8 +170,6 @@
     return ret;
 }
 
-#pragma mark -
-#pragma mark setup
 
 static int ParseArgs(int argc, const char **argv)
 {
@@ -509,7 +299,7 @@
         gWimpyMode = 1;
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
     PrintArch();
 
@@ -526,6 +316,7 @@
     return 0;
 }
 
+
 static void PrintUsage(void)
 {
     int i;
@@ -564,63 +355,6 @@
 }
 
 
-static int GetTestCase(const char *name, Type *outType, Type *inType,
-                       SaturationMode *sat, RoundingMode *round)
-{
-    int i;
-
-    // Find the return type
-    for (i = 0; i < kTypeCount; i++)
-        if (name == strstr(name, gTypeNames[i]))
-        {
-            *outType = (Type)i;
-            name += strlen(gTypeNames[i]);
-
-            break;
-        }
-
-    if (i == kTypeCount) return -1;
-
-    // Check to see if _sat appears next
-    *sat = (SaturationMode)0;
-    for (i = 1; i < kSaturationModeCount; i++)
-        if (name == strstr(name, gSaturationNames[i]))
-        {
-            *sat = (SaturationMode)i;
-            name += strlen(gSaturationNames[i]);
-            break;
-        }
-
-    *round = (RoundingMode)0;
-    for (i = 1; i < kRoundingModeCount; i++)
-        if (name == strstr(name, gRoundingModeNames[i]))
-        {
-            *round = (RoundingMode)i;
-            name += strlen(gRoundingModeNames[i]);
-            break;
-        }
-
-    if (*name != '_') return -2;
-    name++;
-
-    for (i = 0; i < kTypeCount; i++)
-        if (name == strstr(name, gTypeNames[i]))
-        {
-            *inType = (Type)i;
-            name += strlen(gTypeNames[i]);
-
-            break;
-        }
-
-    if (i == kTypeCount) return -3;
-
-    if (*name != '\0') return -4;
-
-    return 0;
-}
-
-#pragma mark -
-#pragma mark OpenCL
 
 test_status InitCL(cl_device_id device)
 {
@@ -678,6 +412,20 @@
     }
     gTestDouble &= gHasDouble;
 
+    // detect whether profile of the device is embedded
+    char profile[1024] = "";
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
+                                 profile, NULL)))
+    {
+        vlog_error("clGetDeviceInfo failed. (%d)\n", error);
+        return TEST_FAIL;
+    }
+    else if (strstr(profile, "EMBEDDED_PROFILE"))
+    {
+        gIsEmbedded = 1;
+        if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0;
+    }
+
     gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
     if (NULL == gContext || error)
     {
@@ -726,10 +474,6 @@
         }
     }
 
-
-    gMTdata = init_genrand(gRandomSeed);
-
-
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
     vlog("\nCompute Device info:\n");
@@ -760,977 +504,4 @@
     return TEST_PASS;
 }
 
-static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
-                     size_t blockCount)
-{
-    // The global dimensions are just the blockCount to execute since we haven't
-    // set up multiple queues for multiple devices.
-    int error;
-
-    error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
-    error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
-
-    if (error)
-    {
-        vlog_error("FAILED -- could not set kernel args (%d)\n", error);
-        return error;
-    }
-
-    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
-                                        NULL, 0, NULL, NULL)))
-    {
-        vlog_error("FAILED -- could not execute kernel (%d)\n", error);
-        return error;
-    }
-
-    return 0;
-}
-
-#if defined(__APPLE__)
-#include <mach/mach_time.h>
-#endif
-
-uint64_t GetTime(void);
-uint64_t GetTime(void)
-{
-#if defined(__APPLE__)
-    return mach_absolute_time();
-#elif defined(_MSC_VER)
-    return ReadTime();
-#else
-    // mach_absolute_time is a high precision timer with precision < 1
-    // microsecond.
-#warning need accurate clock here.  Times are invalid.
-    return 0;
-#endif
-}
-
-
-#if defined(_MSC_VER)
-/* function is defined in "compat.h" */
-#else
-double SubtractTime(uint64_t endTime, uint64_t startTime);
-double SubtractTime(uint64_t endTime, uint64_t startTime)
-{
-    uint64_t diff = endTime - startTime;
-    static double conversion = 0.0;
-
-    if (0.0 == conversion)
-    {
-#if defined(__APPLE__)
-        mach_timebase_info_data_t info = { 0, 0 };
-        kern_return_t err = mach_timebase_info(&info);
-        if (0 == err)
-            conversion = 1e-9 * (double)info.numer / (double)info.denom;
-#else
-        // This function consumes output from GetTime() above, and converts the
-        // time to secionds.
-#warning need accurate ticks to seconds conversion factor here. Times are invalid.
-#endif
-    }
-
-    // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double)diff;
-}
-#endif
-
-typedef struct CalcReferenceValuesInfo
-{
-    struct WriteInputBufferInfo
-        *parent; // pointer back to the parent WriteInputBufferInfo struct
-    cl_kernel kernel; // the kernel for this vector size
-    cl_program program; // the program for this vector size
-    cl_uint vectorSize; // the vector size for this callback chain
-    void *p; // the pointer to mapped result data for this vector size
-    cl_int result;
-} CalcReferenceValuesInfo;
-
-typedef struct WriteInputBufferInfo
-{
-    volatile cl_event
-        calcReferenceValues; // user event which signals when main thread is
-                             // done calculating reference values
-    volatile cl_event
-        doneBarrier; // user event which signals when worker threads are done
-    cl_uint count; // the number of elements in the array
-    Type outType; // the data type of the conversion result
-    Type inType; // the data type of the conversion input
-    volatile int barrierCount;
-    CalcReferenceValuesInfo calcInfo[kCallStyleCount];
-} WriteInputBufferInfo;
-
-cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
-cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
-{
-    if (0 == (x & (x - 1))) return x;
-
-    while (x & (x - 1)) x &= x - 1;
-
-    return x + x;
-}
-
-void WriteInputBufferComplete(void *);
-
-typedef struct DataInitInfo
-{
-    cl_ulong start;
-    cl_uint size;
-    Type outType;
-    Type inType;
-    SaturationMode sat;
-    RoundingMode round;
-    MTdata *d;
-} DataInitInfo;
-
-cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
-cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
-{
-    DataInitInfo *info = (DataInitInfo *)p;
-
-    gInitFunctions[info->inType](
-        (char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat,
-        info->round, info->outType, info->start + job_id * info->size,
-        info->size, info->d[thread_id]);
-    return CL_SUCCESS;
-}
-
-static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
-{
-    cl_uint i;
-    for (i = 0; i < count; ++i)
-        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
-}
-
-cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
-cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
-{
-    DataInitInfo *info = (DataInitInfo *)p;
-    cl_uint count = info->size;
-    Type inType = info->inType;
-    Type outType = info->outType;
-    RoundingMode round = info->round;
-    size_t j;
-
-    Force64BitFPUPrecision();
-
-    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
-    void *a = (cl_uchar *)gAllowZ + job_id * count;
-    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
-
-    if (outType != inType)
-    {
-        // create the reference while we wait
-        Convert f = gConversions[outType][inType];
-        if (info->sat) f = gSaturatedConversions[outType][inType];
-
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-        /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-         * integer to float types, hence GCC ARM uses the floating-point
-         * emulation code despite which -mfloat-abi setting it is. But the
-         * emulation code in libgcc.a has only one rounding mode (round to
-         * nearest even in this case) and ignores the user rounding mode setting
-         * in hardware. As a result setting rounding modes in hardware won't
-         * give correct rounding results for type covert from 64-bit integer to
-         * float using GCC for ARM compiler so for testing different rounding
-         * modes, we need to use alternative reference function. ARM64 does have
-         * an instruction, however we cannot guarantee the compiler will use it.
-         * On all ARM architechures use emulation to calculate reference.*/
-        switch (round)
-        {
-            /* conversions to floating-point type use the current rounding mode.
-             * The only default floating-point rounding mode supported is round
-             * to nearest even i.e the current rounding mode will be _rte for
-             * floating-point types. */
-            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
-            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
-            case kRoundUp: qcom_rm = qcomRTP; break;
-            case kRoundDown: qcom_rm = qcomRTN; break;
-            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
-            default:
-                vlog_error("ERROR: undefined rounding mode %d\n", round);
-                break;
-        }
-        qcom_sat = info->sat;
-#endif
-
-        RoundingMode oldRound = set_round(round, outType);
-        f(d, s, count);
-        set_round(oldRound, outType);
-
-        // Decide if we allow a zero result in addition to the correctly rounded
-        // one
-        memset(a, 0, count);
-        if (gForceFTZ)
-        {
-            if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
-            if (outType == kfloat)
-                setAllowZ((uint8_t *)a, (uint32_t *)d, count);
-        }
-    }
-    else
-    {
-        // Copy the input to the reference
-        memcpy(d, s, info->size * gTypeSizes[inType]);
-    }
-
-    // Patch up NaNs conversions to integer to zero -- these can be converted to
-    // any integer
-    if (info->outType != kfloat && info->outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((double *)d)[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((float *)d)[j] = NAN;
-            }
-        }
-    }
-
-    return CL_SUCCESS;
-}
-
-static int DoTest(cl_device_id device, Type outType, Type inType,
-                  SaturationMode sat, RoundingMode round, MTdata d)
-{
-#ifdef __APPLE__
-    cl_ulong wall_start = mach_absolute_time();
-#endif
-
-    DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
-    WriteInputBufferInfo writeInputBufferInfo;
-    int vectorSize;
-    int error = 0;
-    cl_uint threads = GetThreadCount();
-    uint64_t i;
-
-    gTestCount++;
-    size_t blockCount =
-        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
-    size_t step = blockCount;
-    uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
-
-    memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
-    init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
-    if (NULL == init_info.d)
-    {
-        vlog_error(
-            "ERROR: Unable to allocate storage for random number generator!\n");
-        return -1;
-    }
-    for (i = 0; i < threads; i++)
-    {
-        init_info.d[i] = init_genrand(genrand_int32(d));
-        if (NULL == init_info.d[i])
-        {
-            vlog_error("ERROR: Unable to allocate storage for random number "
-                       "generator!\n");
-            return -1;
-        }
-    }
-
-    writeInputBufferInfo.outType = outType;
-    writeInputBufferInfo.inType = inType;
-
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        writeInputBufferInfo.calcInfo[vectorSize].program =
-            MakeProgram(outType, inType, sat, round, vectorSize,
-                        &writeInputBufferInfo.calcInfo[vectorSize].kernel);
-        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
-        {
-            gFailCount++;
-            return -1;
-        }
-        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
-        {
-            gFailCount++;
-            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
-            return -2;
-        }
-
-        writeInputBufferInfo.calcInfo[vectorSize].parent =
-            &writeInputBufferInfo;
-        writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
-        writeInputBufferInfo.calcInfo[vectorSize].result = -1;
-    }
-
-    if (gSkipTesting) goto exit;
-
-    // Patch up rounding mode if default is RTZ
-    // We leave the part above in default rounding mode so that the right kernel
-    // is compiled.
-    if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
-        init_info.round = round = kRoundTowardZero;
-
-    // Figure out how many elements are in a work block
-
-    // we handle 64-bit types a bit differently.
-    if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
-
-    if (!gWimpyMode && gIsEmbedded)
-        step = blockCount * EMBEDDED_REDUCTION_FACTOR;
-
-    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
-    vlog("Testing... ");
-    fflush(stdout);
-    for (i = 0; i < (uint64_t)lastCase; i += step)
-    {
-
-        if (0 == (i & ((lastCase >> 3) - 1)))
-        {
-            vlog(".");
-            fflush(stdout);
-        }
-
-        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
-        writeInputBufferInfo.count = count;
-
-        // Crate a user event to represent the status of the reference value
-        // computation completion
-        writeInputBufferInfo.calcReferenceValues =
-            clCreateUserEvent(gContext, &error);
-        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
-        {
-            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // retain for consumption by MapOutputBufferComplete
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            if ((error =
-                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
-            {
-                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
-                gFailCount++;
-                goto exit;
-            }
-        }
-
-        // Crate a user event to represent when the callbacks are done verifying
-        // correctness
-        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
-        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
-        {
-            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // retain for use by the callback that calls this
-        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        //      Call this in a multithreaded manner
-        //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d
-        //      );
-        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
-        init_info.start = i;
-        init_info.size = count / chunks;
-        if (init_info.size < 16384)
-        {
-            chunks = RoundUpToNextPowerOfTwo(threads);
-            init_info.size = count / chunks;
-            if (init_info.size < 16384)
-            {
-                init_info.size = count;
-                chunks = 1;
-            }
-        }
-        ThreadPool_Do(InitData, chunks, &init_info);
-
-        // Copy the results to the device
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          count * gTypeSizes[inType], gIn, 0,
-                                          NULL, NULL)))
-        {
-            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // Call completion callback for the write, which will enqueue the rest
-        // of the work.
-        WriteInputBufferComplete((void *)&writeInputBufferInfo);
-
-        // Make sure the work is actually running, so we don't deadlock
-        if ((error = clFlush(gQueue)))
-        {
-            vlog_error("clFlush failed with error %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        ThreadPool_Do(PrepareReference, chunks, &init_info);
-
-        // signal we are done calculating the reference results
-        if ((error = clSetUserEventStatus(
-                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
-        {
-            vlog_error(
-                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
-                error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // Wait for the event callbacks to finish verifying correctness.
-        if ((error = clWaitForEvents(
-                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
-        {
-            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
-            {
-                switch (inType)
-                {
-                    case kuchar:
-                    case kchar:
-                        vlog("Input value: 0x%2.2x ",
-                             ((unsigned char *)gIn)[error - 1]);
-                        break;
-                    case kushort:
-                    case kshort:
-                        vlog("Input value: 0x%4.4x ",
-                             ((unsigned short *)gIn)[error - 1]);
-                        break;
-                    case kuint:
-                    case kint:
-                        vlog("Input value: 0x%8.8x ",
-                             ((unsigned int *)gIn)[error - 1]);
-                        break;
-                    case kfloat:
-                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
-                        break;
-                        break;
-                    case kulong:
-                    case klong:
-                        vlog("Input value: 0x%16.16llx ",
-                             ((unsigned long long *)gIn)[error - 1]);
-                        break;
-                    case kdouble:
-                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
-                        break;
-                    default:
-                        vlog_error("Internal error at %s: %d\n", __FILE__,
-                                   __LINE__);
-                        abort();
-                        break;
-                }
-
-                // tell the user which conversion it was.
-                if (0 == vectorSize)
-                    vlog(" (implicit scalar conversion from %s to %s)\n",
-                         gTypeNames[inType], gTypeNames[outType]);
-                else
-                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
-                         sizeNames[vectorSize], gSaturationNames[sat],
-                         gRoundingModeNames[round], gTypeNames[inType],
-                         sizeNames[vectorSize]);
-
-                gFailCount++;
-                goto exit;
-            }
-        }
-    }
-
-    log_info("done.\n");
-
-    if (gTimeResults)
-    {
-        // Kick off tests for the various vector lengths
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            size_t workItemCount = blockCount / vectorSizes[vectorSize];
-            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
-                workItemCount /=
-                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            cl_uint k;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = RunKernel(
-                         writeInputBufferInfo.calcInfo[vectorSize].kernel,
-                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
-                {
-                    gFailCount++;
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (workItemCount * vectorSizes[vectorSize]);
-            if (0 == vectorSize)
-                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
-                          "implicit convert %s -> %s", gTypeNames[inType],
-                          gTypeNames[outType]);
-            else
-                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
-                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
-                          sizeNames[vectorSize], gSaturationNames[sat],
-                          gRoundingModeNames[round], gTypeNames[inType],
-                          sizeNames[vectorSize]);
-        }
-    }
-
-    if (gWimpyMode)
-        vlog("\tWimp pass");
-    else
-        vlog("\tpassed");
-
-#ifdef __APPLE__
-    // record the run time
-    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
-#endif
-    vlog("\n\n");
-    fflush(stdout);
-
-
-exit:
-    // clean up
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
-        clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
-    }
-
-    if (init_info.d)
-    {
-        for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
-        free(init_info.d);
-    }
-
-    return error;
-}
-
-void MapResultValuesComplete(void *data);
-
-// Note: not called reentrantly
-void WriteInputBufferComplete(void *data)
-{
-    cl_int status;
-    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
-    cl_uint count = info->count;
-    int vectorSize;
-
-    info->barrierCount = gMaxVectorSize - gMinVectorSize;
-
-    // now that we know that the write buffer is complete, enqueue callbacks to
-    // wait for the main thread to finish calculating the reference results.
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        size_t workItemCount =
-            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
-
-        if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
-                                gOutBuffers[vectorSize], workItemCount)))
-        {
-            gFailCount++;
-            return;
-        }
-
-        info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
-            gQueue, gOutBuffers[vectorSize], CL_TRUE,
-            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
-            NULL, NULL, &status);
-        {
-            if (status)
-            {
-                vlog_error("ERROR: WriteInputBufferComplete calback failed "
-                           "with status: %d\n",
-                           status);
-                gFailCount++;
-                return;
-            }
-        }
-    }
-
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        MapResultValuesComplete(info->calcInfo + vectorSize);
-    }
-
-    // Make sure the work starts moving -- otherwise we may deadlock
-    if ((status = clFlush(gQueue)))
-    {
-        vlog_error(
-            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
-            status);
-        gFailCount++;
-        return;
-    }
-
-    // e was already released by the main thread. It should be destroyed
-    // automatically soon after we exit.
-}
-
-void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
-                                             void *data);
-
-// Note: May be called reentrantly
-void MapResultValuesComplete(void *data)
-{
-    cl_int status;
-    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
-    cl_event calcReferenceValues = info->parent->calcReferenceValues;
-
-    // we know that the map is done, wait for the main thread to finish
-    // calculating the reference values
-    if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
-                                     CalcReferenceValuesComplete, data)))
-    {
-        vlog_error("ERROR: clSetEventCallback failed in "
-                   "MapResultValuesComplete with status: %d\n",
-                   status);
-        gFailCount++; // not thread safe -- being lazy here
-    }
-
-    // this thread no longer needs its reference to info->calcReferenceValues,
-    // so release it
-    if ((status = clReleaseEvent(calcReferenceValues)))
-    {
-        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
-                   "with status: %d\n",
-                   status);
-        gFailCount++; // not thread safe -- being lazy here
-    }
-
-    // no need to flush since we didn't enqueue anything
-
-    // e was already released by WriteInputBufferComplete. It should be
-    // destroyed automatically soon after we exit.
-}
-
-
-void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
-                                             void *data)
-{
-    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
-    cl_uint vectorSize = info->vectorSize;
-    cl_uint count = info->parent->count;
-    Type outType =
-        info->parent->outType; // the data type of the conversion result
-    Type inType = info->parent->inType; // the data type of the conversion input
-    size_t j;
-    cl_int error;
-    cl_event doneBarrier = info->parent->doneBarrier;
-
-    // report spurious error condition
-    if (CL_SUCCESS != status)
-    {
-        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
-                   status);
-        gFailCount++; // lazy about thread safety here
-        return;
-    }
-
-    // Now we know that both results have been mapped back from the device, and
-    // the main thread is done calculating the reference results. It is now time
-    // to check the results.
-
-    // verify results
-    void *mapped = info->p;
-
-    // Patch up NaNs conversions to integer to zero -- these can be converted to
-    // any integer
-    if (outType != kfloat && outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)gIn;
-            double *outp = (double *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)gIn;
-            float *outp = (float *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-    }
-
-    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
-        info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count,
-                                              vectorSizes[vectorSize]);
-    else
-        info->result = 0;
-
-    // Fill the output buffer with junk and release it
-    {
-        cl_uint pattern = 0xffffdead;
-        memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
-        if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
-                                             mapped, 0, NULL, NULL)))
-        {
-            vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
-                       "CalcReferenceValuesComplete  (%d)\n",
-                       error);
-            gFailCount++;
-        }
-    }
-
-    if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
-    {
-        if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
-        {
-            vlog_error("ERROR: clSetUserEventStatus failed in "
-                       "CalcReferenceValuesComplete (err: %d). We're probably "
-                       "going to deadlock.\n",
-                       status);
-            gFailCount++;
-            return;
-        }
-
-        if ((status = clReleaseEvent(doneBarrier)))
-        {
-            vlog_error("ERROR: clReleaseEvent failed in "
-                       "CalcReferenceValuesComplete (err: %d).\n",
-                       status);
-            gFailCount++;
-            return;
-        }
-    }
-    // e was already released by WriteInputBufferComplete. It should be
-    // destroyed automatically soon after all the calls to
-    // CalcReferenceValuesComplete exit.
-}
-
-static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
-                              RoundingMode round, int vectorSize,
-                              cl_kernel *outKernel)
-{
-    cl_program program;
-    char testName[256];
-    int error = 0;
-
-    std::ostringstream source;
-    if (outType == kdouble || inType == kdouble)
-        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-
-    // Create the program. This is a bit complicated because we are trying to
-    // avoid byte and short stores.
-    if (0 == vectorSize)
-    {
-        // Create the type names.
-        char inName[32];
-        char outName[32];
-        strncpy(inName, gTypeNames[inType], sizeof(inName));
-        strncpy(outName, gTypeNames[outType], sizeof(outName));
-        sprintf(testName, "test_implicit_%s_%s", outName, inName);
-
-        source << "__kernel void " << testName << "( __global " << inName
-               << " *src, __global " << outName << " *dest )\n";
-        source << "{\n";
-        source << "   size_t i = get_global_id(0);\n";
-        source << "   dest[i] =  src[i];\n";
-        source << "}\n";
-
-        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
-             gTypeNames[outType]);
-        fflush(stdout);
-    }
-    else
-    {
-        int vectorSizetmp = vectorSizes[vectorSize];
-
-        // Create the type names.
-        char convertString[128];
-        char inName[32];
-        char outName[32];
-        switch (vectorSizetmp)
-        {
-            case 1:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
-                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
-                         outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s", convertString, inName);
-                vlog("Building %s( %s ) test\n", convertString, inName);
-                break;
-            case 3:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
-                snprintf(convertString, sizeof(convertString),
-                         "convert_%s3%s%s", outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s3", convertString, inName);
-                vlog("Building %s( %s3 ) test\n", convertString, inName);
-                break;
-            default:
-                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
-                         vectorSizetmp);
-                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
-                         vectorSizetmp);
-                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
-                         outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s", convertString, inName);
-                vlog("Building %s( %s ) test\n", convertString, inName);
-                break;
-        }
-        fflush(stdout);
-
-        if (vectorSizetmp == 3)
-        {
-            source << "__kernel void " << testName << "( __global " << inName
-                   << " *src, __global " << outName << " *dest )\n";
-            source << "{\n";
-            source << "   size_t i = get_global_id(0);\n";
-            source << "   if( i + 1 < get_global_size(0))\n";
-            source << "       vstore3( " << convertString
-                   << "( vload3( i, src)), i, dest );\n";
-            source << "   else\n";
-            source << "   {\n";
-            source << "       " << inName << "3 in;\n";
-            source << "       " << outName << "3 out;\n";
-            source << "       if( 0 == (i & 1) )\n";
-            source << "           in.y = src[3*i+1];\n";
-            source << "       in.x = src[3*i];\n";
-            source << "       out = " << convertString << "( in ); \n";
-            source << "       dest[3*i] = out.x;\n";
-            source << "       if( 0 == (i & 1) )\n";
-            source << "           dest[3*i+1] = out.y;\n";
-            source << "   }\n";
-            source << "}\n";
-        }
-        else
-        {
-            source << "__kernel void " << testName << "( __global " << inName
-                   << " *src, __global " << outName << " *dest )\n";
-            source << "{\n";
-            source << "   size_t i = get_global_id(0);\n";
-            source << "   dest[i] = " << convertString << "( src[i] );\n";
-            source << "}\n";
-        }
-    }
-    *outKernel = NULL;
-
-    const char *flags = NULL;
-    if (gForceFTZ) flags = "-cl-denorms-are-zero";
-
-    // build it
-    std::string sourceString = source.str();
-    const char *programSource = sourceString.c_str();
-    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
-                                        &programSource, testName, flags);
-    if (error)
-    {
-        vlog_error("Failed to build kernel/program (err = %d).\n", error);
-        clReleaseProgram(program);
-        return NULL;
-    }
 
-    return program;
-}
diff --git a/test_conformance/d3d10/harness.h b/test_conformance/d3d10/harness.h
index 184e52c..afeb496 100644
--- a/test_conformance/d3d10/harness.h
+++ b/test_conformance/d3d10/harness.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _HARNESS_H_
-#define _HARNESS_H_
+#ifndef HARNESS_H_
+#define HARNESS_H_
 
 #define _CRT_SECURE_NO_WARNINGS
 
diff --git a/test_conformance/device_execution/CMakeLists.txt b/test_conformance/device_execution/CMakeLists.txt
index 5e9e30e..275b96c 100644
--- a/test_conformance/device_execution/CMakeLists.txt
+++ b/test_conformance/device_execution/CMakeLists.txt
@@ -17,6 +17,6 @@
     utils.cpp
 )
 
-include(../CMakeCommon.txt)
+set_gnulike_module_compile_flags("-Wno-sign-compare")
 
-# end of file #
+include(../CMakeCommon.txt)
diff --git a/test_conformance/device_execution/enqueue_block.cpp b/test_conformance/device_execution/enqueue_block.cpp
index 29a6cec..4ddd1db 100644
--- a/test_conformance/device_execution/enqueue_block.cpp
+++ b/test_conformance/device_execution/enqueue_block.cpp
@@ -27,561 +27,538 @@
 
 #ifdef CL_VERSION_2_0
 extern int gWimpyMode;
-static const char* enqueue_simple_block[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_simple_block(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  res[tid] = -1;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
 
-static const char* enqueue_block_with_local_arg1[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg1(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+// clang-format off
+static const char* enqueue_simple_block[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
 
-static const char* enqueue_block_with_local_arg2[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp1[i]   = mul * 7 - 21;"
-    NL, "    tmp2[i].x = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].y = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].z = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].w = (float)(mul * 7 - 21);"
-    NL, ""
-    NL, "    res[tid] += tmp1[i];"
-    NL, "    res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg2(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)"
-    NL, "    { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_simple_block(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_with_wait_list[] =
-{
-    NL, "#define BLOCK_SUBMITTED 1"
-    NL, "#define BLOCK_COMPLETED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,"
-    NL, "  ^{"
-    NL, "      res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
 
-static const char* enqueue_block_with_wait_list_and_local_arg[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define BLOCK_STARTED   3"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  res[tid] = BLOCK_STARTED;"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  if(res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt, "
-    NL, "    ^(__local void* buf) {"
-    NL, "       block_fn_local_arg(tid, multiplier, res, (__local int*)buf);"
-    NL, "     }, LOCAL_MEM_SIZE*sizeof(int));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      res[tid] = -1;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_block_get_kernel_work_group_size[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_work_group_size(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_work_group_size(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+static const char* enqueue_block_with_local_arg1[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_block_capture_event_profiling_info_after_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const clk_event_t evt, __global int* res)"
-    NL, "{"
-    NL, "    capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "    release_event(evt);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)"
-    NL, "{"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg1(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_capture_event_profiling_info_before_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const ulong *value, __global int* res)"
-    NL, "{"
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, "    clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, "    clk_event_t block_evt2;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, &value, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "    release_event(user_evt);"
-    NL, "    release_event(block_evt1);"
-    NL, "    release_event(block_evt2);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };
 
-static const char* enqueue_block_with_barrier[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  if(mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "  res[tid] = mul * 7 -21;"
-    NL, "}"
-    NL, ""
-    NL, "void loop_fn(size_t tid, int n, __global int* res)"
-    NL, "{"
-    NL, "  while(n > 0)"
-    NL, "  {"
-    NL, "    barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "    res[tid] = 0;"
-    NL, "    --n;"
-    NL, "  }"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_barrier(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  res[tid] = -1;"
-    NL, "  size_t n = 256;"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  ndrange_t ndrange = ndrange_1D(n);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_marker_with_block_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_block_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t block_evt1;"
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt1);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+static const char* enqueue_block_with_local_arg2[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_marker_with_user_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_user_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  uint multiplier = 7;"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, "  clk_event_t block_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt, "
-    NL, "  ^{"
-    NL, "     if(res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;"
-    NL, "   });"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp1[i]   = mul * 7 - 21;
+        tmp2[i].x = (float)(mul * 7 - 21);
+        tmp2[i].y = (float)(mul * 7 - 21);
+        tmp2[i].z = (float)(mul * 7 - 21);
+        tmp2[i].w = (float)(mul * 7 - 21);
 
-static const char* enqueue_marker_with_mixed_events[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t mix_ev[2];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(mix_ev[0]);"
-    NL, "}"
-    NL
-};
+        res[tid] += tmp1[i];
+        res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_block_with_mixed_events[] =
-{
-    NL, "kernel void enqueue_block_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  int enq_res;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t mix_ev[3];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  res[tid] = -2;"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -4; return; }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[0]);"
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(mix_ev[2]);"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg2(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+
+      void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)
+        { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };
+
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+// OpenCL C source: the kernel enqueues a block behind a user event, then (while res[tid] is still BLOCK_SUBMITTED) a second block behind the first block event that stores CHECK_SUCCESS if res[tid] == BLOCK_COMPLETED.
+static const char* enqueue_block_with_wait_list[] = { R"(
+    #define BLOCK_SUBMITTED 1
+    #define BLOCK_COMPLETED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_block_with_wait_list(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+      ^{
+          res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+// OpenCL C source: a block taking a __local buffer of LOCAL_MEM_SIZE*sizeof(int) bytes is enqueued behind a user event; a second block waiting on its event stores CHECK_SUCCESS if res[tid] == BLOCK_COMPLETED. With multiplier = 3 each mul * 7 - 21 term is 0.
+static const char* enqueue_block_with_wait_list_and_local_arg[] = { R"(
+    #define LOCAL_MEM_SIZE 10
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define BLOCK_STARTED   3
+    #define CHECK_SUCCESS   0
+
+    void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      res[tid] = BLOCK_STARTED;
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      if (res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;
+    }
+
+    kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+        ^(__local void* buf) {
+           block_fn_local_arg(tid, multiplier, res, (__local int*)buf);
+         }, LOCAL_MEM_SIZE*sizeof(int));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+// OpenCL C source: queries get_kernel_work_group_size(kernelBlock), stores -1 and returns if the result is 0, otherwise enqueues the block on a 1D ndrange of 4 * that size with that size as the local work size.
+static const char* enqueue_block_get_kernel_work_group_size[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_work_group_size(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_work_group_size(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+// OpenCL C source: queries get_kernel_preferred_work_group_size_multiple(kernelBlock), stores -1 and returns if the result is 0, otherwise enqueues the block on a 1D ndrange of 4 * that size with that size as the local work size.
+static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+// OpenCL C source in two array entries: a MAX_GWS #define line, then the program text. The kernel enqueues a block, then a check block that waits on the event of the first one.
+static const char* enqueue_block_capture_event_profiling_info_after_execution[] = { // check_res captures profiling info into value[tid*2..] and stores 0 if both ulongs are > 0, else -4
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n" // presumably the host MAX_GWS value as text; STRINGIFY_VALUE is defined elsewhere (confirm)
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, const clk_event_t evt, __global int* res)
+    {
+        capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+        release_event(evt);
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)
+    {
+        size_t tid = get_global_id(0);
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+    }
+)" };
+// OpenCL C source in two array entries: a MAX_GWS #define line, then the program text. Profiling capture is requested for a block still gated on a user event and verified by a follow-up block that waits on that block.
+static const char* enqueue_block_capture_event_profiling_info_before_execution[] = { // check_res reads the program-scope value[] through a __global pointer and stores 0 if both ulongs are > 0, else -4
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n" // presumably the host MAX_GWS value as text; STRINGIFY_VALUE is defined elsewhere (confirm)
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, __global const ulong *value, __global int* res)
+    {
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+        clk_event_t user_evt = create_user_event();
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+        clk_event_t block_evt2;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        set_user_event_status(user_evt, CL_COMPLETE);
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, value, res);       };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+        release_event(user_evt);
+        release_event(block_evt1);
+        release_event(block_evt2);
+    }
+)" };
+// OpenCL C source: over a 1D ndrange of 256 items the kernel enqueues a block that calls barrier() (when mul > 0) before storing mul * 7 - 21, then a block that loops n times doing barrier() and res[tid] = 0.
+static const char* enqueue_block_with_barrier[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      if (mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);
+      res[tid] = mul * 7 -21;
+    }
+
+    void loop_fn(size_t tid, int n, __global int* res)
+    {
+      while (n > 0)
+      {
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        res[tid] = 0;
+        --n;
+      }
+    }
+
+    kernel void enqueue_block_with_barrier(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      queue_t def_q = get_default_queue();
+      res[tid] = -1;
+      size_t n = 256;
+
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+      ndrange_t ndrange = ndrange_1D(n);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_marker_with_block_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_block_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t block_evt1;
+      clk_event_t marker_evt;
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt1);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_user_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_user_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+      uint multiplier = 7;
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t marker_evt;
+      clk_event_t block_evt;
+
+      int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt,
+      ^{
+         if (res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;
+       });
+
+      //check block is not started
+      if (res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_mixed_events[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_mixed_events(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t mix_ev[2];
+      mix_ev[0] = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      clk_event_t marker_evt;
+
+      enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[1]);
+      release_event(marker_evt);
+      release_event(mix_ev[0]);
+    }
+)" };
+
+static const char* enqueue_block_with_mixed_events[] = { R"(
+    kernel void enqueue_block_with_mixed_events(__global int* res)
+    {
+      int enq_res;
+      size_t tid = get_global_id(0);
+      clk_event_t mix_ev[3];
+      mix_ev[0] = create_user_event();
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      res[tid] = -2;
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -4; return; }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[0]);
+      release_event(mix_ev[1]);
+      release_event(mix_ev[2]);
+    }
+)" };
+// clang-format on
 
 static const kernel_src sources_enqueue_block[] =
 {
diff --git a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
index 4b9968c..8a4a116 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
@@ -3,6 +3,7 @@
 set(${MODULE_NAME}_SOURCES
     main.cpp
     basic_command_buffer.cpp
+    svm_command_basic.cpp
     command_buffer_printf.cpp
     command_buffer_get_command_buffer_info.cpp
     command_buffer_set_kernel_arg.cpp
@@ -14,8 +15,11 @@
     command_buffer_test_copy.cpp
     command_buffer_test_barrier.cpp
     command_buffer_test_event_info.cpp
+    command_buffer_finalize.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sign-compare")
+
 include(../../CMakeCommon.txt)
 
 add_subdirectory( cl_khr_command_buffer_mutable_dispatch )
diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
index 43734da..6c02f9f 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
@@ -201,14 +201,33 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        const cl_int new_pattern = 12;
+        error = clEnqueueFillBuffer(queue, in_mem, &new_pattern, sizeof(cl_int),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(new_pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
index a20229e..d08a11a 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_BASIC_COMMAND_BUFFER_H
-#define _CL_KHR_BASIC_COMMAND_BUFFER_H
+#ifndef CL_KHR_BASIC_COMMAND_BUFFER_H
+#define CL_KHR_BASIC_COMMAND_BUFFER_H
 
 #include "command_buffer_test_base.h"
 #include "harness/typeWrappers.h"
@@ -28,12 +28,24 @@
     {                                                                          \
         if (reference != result)                                               \
         {                                                                      \
-            log_error("Expected %d was %d at index %u\n", reference, result,   \
+            log_error("Expected %d was %d at index %zu\n", reference, result,  \
                       index);                                                  \
             return TEST_FAIL;                                                  \
         }                                                                      \
     }
 
+// Looks up extension entry point FUNC via `platform` (must be in scope) and
+// makes the enclosing function return TEST_FAIL if the lookup yields nullptr.
+#define GET_EXTENSION_ADDRESS(FUNC)                                            \
+    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
+        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
+    if (FUNC == nullptr)                                                       \
+    {                                                                          \
+        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
+                  " with " #FUNC "\n");                                        \
+        return TEST_FAIL;                                                      \
+    }
+
 // Helper test fixture for constructing OpenCL objects used in testing
 // a variety of simple command-buffer enqueue scenarios.
 struct BasicCommandBufferTest : CommandBufferTestBase
@@ -70,6 +82,7 @@
     clCommandBufferWrapper command_buffer;
 };
 
+
 template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
                    cl_command_queue queue, int num_elements)
@@ -99,4 +112,4 @@
     return TEST_PASS;
 }
 
-#endif // _CL_KHR_BASIC_COMMAND_BUFFER_H
+#endif // CL_KHR_BASIC_COMMAND_BUFFER_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
index e062583..9b598d8 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
@@ -3,7 +3,15 @@
 set(${MODULE_NAME}_SOURCES
     main.cpp
     mutable_command_info.cpp
+    mutable_command_image_arguments.cpp
+    mutable_command_arguments.cpp
+    mutable_command_out_of_order.cpp
+    mutable_command_global_size.cpp
+    mutable_command_local_size.cpp
+    mutable_command_global_offset.cpp
     ../basic_command_buffer.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sign-compare")
+
 include(../../../CMakeCommon.txt)
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
index 9707579..a2fae49 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
@@ -26,6 +26,18 @@
     ADD_TEST(mutable_command_info_global_work_offset),
     ADD_TEST(mutable_command_info_local_work_size),
     ADD_TEST(mutable_command_info_global_work_size),
+    ADD_TEST(mutable_dispatch_image_1d_arguments),
+    ADD_TEST(mutable_dispatch_image_2d_arguments),
+    ADD_TEST(mutable_dispatch_out_of_order),
+    ADD_TEST(mutable_dispatch_simultaneous_out_of_order),
+    ADD_TEST(mutable_dispatch_global_size),
+    ADD_TEST(mutable_dispatch_local_size),
+    ADD_TEST(mutable_dispatch_global_offset),
+    ADD_TEST(mutable_dispatch_svm_arguments),
+    ADD_TEST(mutable_dispatch_local_arguments),
+    ADD_TEST(mutable_dispatch_global_arguments),
+    ADD_TEST(mutable_dispatch_pod_arguments),
+    ADD_TEST(mutable_dispatch_null_arguments),
 };
 
 int main(int argc, const char *argv[])
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
new file mode 100644
index 0000000..5c8291f
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
@@ -0,0 +1,847 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "testHarness.h"
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases for
+// CL_MUTABLE_DISPATCH_ARGUMENTS_KHR:
+// - __global arguments
+// - __local arguments
+// - plain-old-data arguments
+// - NULL arguments
+// - SVM arguments
+
+struct MutableDispatchGlobalArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchGlobalArguments(cl_device_id device, cl_context context,
+                                   cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        BasicMutableCommandBufferTest::SetUp(elements);
+        // NOTE(review): the base SetUp result is discarded here - confirm
+        return 0;
+    }
+
+    cl_int Run() override
+    {
+        cl_int error;
+
+        // Create a kernel that copies src[tid] into dst[tid]
+
+        const char *sample_const_arg_kernel =
+            R"(
+            __kernel void sample_test(__constant int *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst[tid] = src[tid];
+            })";
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        // Create buffers: random src data plus one dst buffer per execution
+
+        MTdataHolder d(gRandomSeed);
+
+        std::vector<cl_int> srcData(num_elements);
+        for (size_t i = 0; i < num_elements; i++)
+            srcData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper srcBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                             num_elements * sizeof(cl_int),
+                                             srcData.data(), &error);
+        test_error(error, "Creating src buffer");
+
+        clMemWrapper dstBuf0 =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           num_elements * sizeof(cl_int), NULL, &error);
+        test_error(error, "Creating initial dst buffer failed");
+
+        clMemWrapper dstBuf1 =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           num_elements * sizeof(cl_int), NULL, &error);
+        test_error(error, "Creating updated dst buffer failed");
+
+        // Build and execute the command buffer for the initial execution
+
+        error = clSetKernelArg(kernel, 0, sizeof(srcBuf), &srcBuf);
+        test_error(error, "Unable to set src kernel arguments");
+
+        error = clSetKernelArg(kernel, 1, sizeof(dstBuf0), &dstBuf0);
+        test_error(error, "Unable to set initial dst kernel argument");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check that the initial execution copied src into dstBuf0
+
+        std::vector<cl_int> dstData0(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf0, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData0.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for initial dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] != dstData0[i])
+            {
+                log_error("Initial data failed to verify: src[%zu]=%d != "
+                          "dst[%zu]=%d\n",
+                          i, srcData[i], i, dstData0[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        // Point argument 1 (dst) at dstBuf1 and re-run the command buffer
+
+        cl_mutable_dispatch_arg_khr arg{ 1, sizeof(dstBuf1), &dstBuf1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            &arg /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check that the modified execution copied src into dstBuf1
+
+        std::vector<cl_int> dstData1(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf1, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for modified dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] != dstData1[i])
+            {
+                log_error("Modified data failed to verify: src[%zu]=%d != "
+                          "dst[%zu]=%d\n",
+                          i, srcData[i], i, dstData1[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+struct MutableDispatchLocalArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchLocalArguments(cl_device_id device, cl_context context,
+                                  cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        BasicMutableCommandBufferTest::SetUp(elements);
+        // NOTE(review): the base SetUp result is discarded here - confirm
+        return 0;
+    }
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(
+            __kernel void sample_test(__constant int *src1, __local int
+            *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                src[tid] = src1[tid];
+                dst[tid] = src[tid];
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        size_t threads[1], localThreads[1];
+        std::vector<cl_int> constantData;
+        std::vector<cl_int> resultData;
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        MTdataHolder d(gRandomSeed);
+
+        size_t sizeToAllocate =
+            ((size_t)max_size / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData.resize(sizeToAllocate / sizeof(cl_int));
+        resultData.resize(sizeToAllocate / sizeof(cl_int));
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            constantData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper streams[2];
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
+                           constantData.data(), &error);
+        test_error(error, "Creating test array failed");
+        streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
+                                    nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error =
+            clSetKernelArg(kernel, 1, numberOfInts * sizeof(cl_int), nullptr);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        threads[0] = numberOfInts;
+        localThreads[0] = 1;
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, threads,
+            localThreads, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // Describe the update: new size for argument 1 (the __local pointer)
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(cl_mem), nullptr };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+        // NOTE(review): no re-enqueue after update; this checks the first run
+        error =
+            clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate,
+                                resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%zu]=%d != "
+                          "resultData[%zu]=%d\n",
+                          i, constantData[i], i, resultData[i]);
+                return TEST_FAIL;
+            }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+    const cl_ulong max_size = 16;
+};
+
+struct MutableDispatchPODArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchPODArguments(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(
+                __kernel void sample_test(__constant int *src, int dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst = src[tid];
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        size_t threads[1], localThreads[1];
+        std::vector<cl_int> constantData;
+        std::vector<cl_int> resultData;
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        MTdataHolder d(gRandomSeed);
+
+        size_t sizeToAllocate =
+            ((size_t)max_size / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData.resize(sizeToAllocate / sizeof(cl_int));
+        resultData.resize(sizeToAllocate / sizeof(cl_int));
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            constantData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper stream;
+        stream = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
+                                constantData.data(), &error);
+        test_error(error, "Creating test array failed");
+
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+        cl_int intarg = 10;
+        error = clSetKernelArg(kernel, 1, sizeof(cl_int), &intarg);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        threads[0] = numberOfInts;
+        localThreads[0] = 1;
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, threads,
+            localThreads, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // Describe the update: argument 1 (the int) changes from 10 to 20
+        intarg = 20;
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(cl_int), &intarg };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+        // NOTE(review): stream is the kernel's input; no kernel output is read
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%zu]=%d != "
+                          "resultData[%zu]=%d\n",
+                          i, constantData[i], i, resultData[i]);
+                return TEST_FAIL;
+            }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+    const cl_ulong max_size = 16;
+};
+
+// Checks that a buffer kernel argument of a recorded ND-range command can be
+// replaced with a null value through clUpdateMutableCommandsKHR.
+struct MutableDispatchNullArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchNullArguments(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Executes the command buffer once with a real source buffer and expects
+    // dst == src, then rebinds argument 0 to null and expects every dst
+    // element to hold the kernel's fallback value 12345.
+    // Returns TEST_PASS / TEST_FAIL, or the failing OpenCL status.
+    cl_int Run() override
+    {
+        const size_t bufferBytes = num_elements * sizeof(cl_int);
+        cl_int err;
+
+        // Kernel under test: copies src to dst, or writes 12345 when src is
+        // null.
+        const char *kernelSource =
+            R"(
+            __kernel void sample_test(__constant int *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst[tid] = src ? src[tid] : 12345;
+            })";
+
+        err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                          &kernelSource, "sample_test");
+        test_error(err, "Creating kernel failed");
+
+        // Random reference input
+        MTdataHolder rng(gRandomSeed);
+
+        std::vector<cl_int> srcData(num_elements);
+        for (cl_int &value : srcData)
+        {
+            value = (cl_int)genrand_int32(rng);
+        }
+
+        clMemWrapper srcMem = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                             bufferBytes, srcData.data(), &err);
+        test_error(err, "Creating src buffer");
+
+        clMemWrapper dstMem = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                             bufferBytes, NULL, &err);
+        test_error(err, "Creating dst buffer failed");
+
+        // Record, finalize and run the command buffer with the initial
+        // arguments
+        err = clSetKernelArg(kernel, 0, sizeof(srcMem), &srcMem);
+        test_error(err, "Unable to set src kernel arguments");
+
+        err = clSetKernelArg(kernel, 1, sizeof(dstMem), &dstMem);
+        test_error(err, "Unable to set initial dst kernel argument");
+
+        // Only kernel arguments are declared updatable for this command.
+        cl_ndrange_kernel_command_properties_khr commandProps[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        err = clCommandNDRangeKernelKHR(command_buffer, nullptr, commandProps,
+                                        kernel, 1, nullptr, &num_elements,
+                                        nullptr, 0, nullptr, nullptr, &command);
+        test_error(err, "clCommandNDRangeKernelKHR failed");
+
+        err = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(err, "clFinalizeCommandBufferKHR failed");
+
+        err = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, nullptr,
+                                        nullptr);
+        test_error(err, "clEnqueueCommandBufferKHR failed");
+
+        // Initial execution: dst must mirror src
+        std::vector<cl_int> initialResult(num_elements);
+        err = clEnqueueReadBuffer(queue, dstMem, CL_TRUE, 0, bufferBytes,
+                                  initialResult.data(), 0, nullptr, nullptr);
+        test_error(err, "clEnqueueReadBuffer for initial dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] == initialResult[i]) continue;
+
+            log_error("Initial data failed to verify: src[%zu]=%d != "
+                      "dst[%zu]=%d\n",
+                      i, srcData[i], i, initialResult[i]);
+            return TEST_FAIL;
+        }
+
+        // Rebind argument 0 (src) to a null value and run again
+        cl_mutable_dispatch_arg_khr nullSrcArg{};
+        nullSrcArg.arg_index = 0;
+        nullSrcArg.arg_size = sizeof(cl_mem);
+        nullSrcArg.arg_value = nullptr;
+
+        // Fields left zero/null mean "no change" for that property.
+        cl_mutable_dispatch_config_khr dispatchUpdate{};
+        dispatchUpdate.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR;
+        dispatchUpdate.next = nullptr;
+        dispatchUpdate.command = command;
+        dispatchUpdate.num_args = 1;
+        dispatchUpdate.arg_list = &nullSrcArg;
+
+        cl_mutable_base_config_khr baseUpdate{};
+        baseUpdate.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR;
+        baseUpdate.next = nullptr;
+        baseUpdate.num_mutable_dispatch = 1;
+        baseUpdate.mutable_dispatch_list = &dispatchUpdate;
+
+        err = clUpdateMutableCommandsKHR(command_buffer, &baseUpdate);
+        test_error(err, "clUpdateMutableCommandsKHR failed");
+
+        err = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, nullptr,
+                                        nullptr);
+        test_error(err, "clEnqueueCommandBufferKHR failed");
+
+        // Modified execution: every element must hold the fallback value
+        std::vector<cl_int> modifiedResult(num_elements);
+        err = clEnqueueReadBuffer(queue, dstMem, CL_TRUE, 0, bufferBytes,
+                                  modifiedResult.data(), 0, nullptr, nullptr);
+        test_error(err, "clEnqueueReadBuffer for modified dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (modifiedResult[i] == 12345) continue;
+
+            log_error("Modified data failed to verify: %d != dst[%zu]=%d\n",
+                      12345, i, modifiedResult[i]);
+            return TEST_FAIL;
+        }
+
+        return TEST_PASS;
+    }
+
+    // Handle of the recorded ND-range command, used as the update target.
+    cl_mutable_command_khr command = nullptr;
+    const cl_ulong max_size = 16;
+};
+
+// Checks that an SVM pointer kernel argument and the
+// CL_KERNEL_EXEC_INFO_SVM_PTRS exec info of a recorded ND-range command can
+// be replaced through clUpdateMutableCommandsKHR.
+struct MutableDispatchSVMArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchSVMArguments(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skips when the CL_DEVICE_SVM_CAPABILITIES query fails or reports no
+    // capabilities, or when the base fixture requests a skip.
+    bool Skip() override
+    {
+        cl_device_svm_capabilities svm_caps;
+        bool svm_capabilities =
+            !clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES,
+                             sizeof(svm_caps), &svm_caps, NULL)
+            && svm_caps != 0;
+
+        return !svm_capabilities || BasicMutableCommandBufferTest::Skip();
+    }
+
+    // Runs the base fixture setup and builds the kernel used by Run().
+    // Returns CL_SUCCESS, or the failing status if the kernel cannot be
+    // created.
+    virtual cl_int SetUp(int elements) override
+    {
+        // NOTE(review): the base SetUp result is discarded, exactly as before
+        // this change; confirm whether it should be propagated.
+        BasicMutableCommandBufferTest::SetUp(elements);
+
+        // The kernel increments each element reached through the pointer
+        // stored inside the wrapper struct.
+        const char *svm_arguments_kernel =
+            R"(
+            typedef struct {
+                global int* ptr;
+            } wrapper;
+            __kernel void test_svm_arguments(__global wrapper* pWrapper)
+            {
+                size_t i = get_global_id(0);
+                pWrapper->ptr[i]++;
+            })";
+
+        // Report a build failure here instead of letting Run() continue with
+        // a kernel that was never created.
+        cl_int error = create_single_kernel_helper(context, &program, &kernel,
+                                                   1, &svm_arguments_kernel,
+                                                   "test_svm_arguments");
+        test_error(error, "Creating kernel failed");
+
+        return CL_SUCCESS;
+    }
+
+    // Runs the command buffer with an initial wrapper/buffer pair, verifies
+    // every element was incremented to 1, then swaps in a second pair via
+    // clUpdateMutableCommandsKHR and verifies that pair the same way.
+    // Returns TEST_PASS / TEST_FAIL, or the failing OpenCL status.
+    cl_int Run() override
+    {
+        const cl_int zero = 0;
+        cl_int error;
+
+        // Allocate and initialize SVM for initial execution
+
+        cl_int *initWrapper = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                   sizeof(cl_int *), 0);
+        cl_int *initBuffer = (cl_int *)clSVMAlloc(
+            context, CL_MEM_READ_WRITE, num_elements * sizeof(cl_int), 0);
+        test_assert_error(initWrapper != nullptr && initBuffer != nullptr,
+                          "clSVMAlloc failed for initial execution");
+
+        // The wrapper holds the address of the data buffer.
+        error = clEnqueueSVMMemcpy(queue, CL_TRUE, initWrapper, &initBuffer,
+                                   sizeof(cl_int *), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed for initWrapper");
+
+        error = clEnqueueSVMMemFill(queue, initBuffer, &zero, sizeof(zero),
+                                    num_elements * sizeof(cl_int), 0, nullptr,
+                                    nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed for initBuffer");
+
+        // Allocate and initialize SVM for modified execution
+
+        // The wrapper stores a pointer, so it must be pointer-sized: the
+        // memcpy below writes sizeof(cl_int *) bytes into it.
+        cl_int *newWrapper = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                  sizeof(cl_int *), 0);
+        cl_int *newBuffer = (cl_int *)clSVMAlloc(
+            context, CL_MEM_READ_WRITE, num_elements * sizeof(cl_int), 0);
+        test_assert_error(newWrapper != nullptr && newBuffer != nullptr,
+                          "clSVMAlloc failed for modified execution");
+
+        error = clEnqueueSVMMemcpy(queue, CL_TRUE, newWrapper, &newBuffer,
+                                   sizeof(cl_int *), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed for newWrapper");
+
+        error = clEnqueueSVMMemFill(queue, newBuffer, &zero, sizeof(zero),
+                                    num_elements * sizeof(cl_int), 0, nullptr,
+                                    nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed for newBuffer");
+
+        // Build and execute the command buffer for the initial execution
+
+        error = clSetKernelArgSVMPointer(kernel, 0, initWrapper);
+        test_error(error, "clSetKernelArgSVMPointer failed for initWrapper");
+
+        // The data buffer is only reached indirectly through the wrapper, so
+        // it is declared to the kernel via exec info.
+        error = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS,
+                                    sizeof(initBuffer), &initBuffer);
+        test_error(error, "clSetKernelExecInfo failed for initBuffer");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR
+                | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR,
+            0
+        };
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Check the results of the initial execution
+
+        error =
+            clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, initBuffer,
+                            num_elements * sizeof(cl_int), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMap failed for initBuffer");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (initBuffer[i] != 1)
+            {
+                log_error("Initial verification failed at index %zu: Got %d, "
+                          "wanted 1\n",
+                          i, initBuffer[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        error = clEnqueueSVMUnmap(queue, initBuffer, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMUnmap failed for initBuffer");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Modify and execute the command buffer
+
+        cl_mutable_dispatch_arg_khr arg_svm{};
+        arg_svm.arg_index = 0;
+        arg_svm.arg_value = newWrapper;
+
+        cl_mutable_dispatch_exec_info_khr exec_info{};
+        exec_info.param_name = CL_KERNEL_EXEC_INFO_SVM_PTRS;
+        exec_info.param_value_size = sizeof(newBuffer);
+        exec_info.param_value = &newBuffer;
+
+        cl_mutable_dispatch_config_khr dispatch_config{};
+        dispatch_config.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR;
+        dispatch_config.command = command;
+        dispatch_config.num_svm_args = 1;
+        dispatch_config.arg_svm_list = &arg_svm;
+        dispatch_config.num_exec_infos = 1;
+        dispatch_config.exec_info_list = &exec_info;
+
+        cl_mutable_base_config_khr mutable_config{};
+        mutable_config.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR;
+        mutable_config.num_mutable_dispatch = 1;
+        mutable_config.mutable_dispatch_list = &dispatch_config;
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the modified execution
+
+        error =
+            clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, newBuffer,
+                            num_elements * sizeof(cl_int), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMap failed for newBuffer");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (newBuffer[i] != 1)
+            {
+                log_error("Modified verification failed at index %zu: Got %d, "
+                          "wanted 1\n",
+                          i, newBuffer[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        error = clEnqueueSVMUnmap(queue, newBuffer, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMUnmap failed for newBuffer");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Clean up (the early-return paths above do not reach this point)
+
+        clSVMFree(context, initWrapper);
+        clSVMFree(context, initBuffer);
+        clSVMFree(context, newWrapper);
+        clSVMFree(context, newBuffer);
+
+        return TEST_PASS;
+    }
+
+    // Handle of the recorded ND-range command, used as the update target.
+    cl_mutable_command_khr command = nullptr;
+};
+
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchLocalArguments fixture.
+int test_mutable_dispatch_local_arguments(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements)
+{
+    using Fixture = MutableDispatchLocalArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchGlobalArguments fixture.
+int test_mutable_dispatch_global_arguments(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements)
+{
+    using Fixture = MutableDispatchGlobalArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchPODArguments fixture.
+int test_mutable_dispatch_pod_arguments(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+{
+    using Fixture = MutableDispatchPODArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchNullArguments fixture.
+int test_mutable_dispatch_null_arguments(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         int num_elements)
+{
+    using Fixture = MutableDispatchNullArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchSVMArguments fixture.
+int test_mutable_dispatch_svm_arguments(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+{
+    using Fixture = MutableDispatchSVMArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
\ No newline at end of file
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
index 9056a00..1914755 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_MUTABLE_COMMAND_BASIC_H
-#define _CL_KHR_MUTABLE_COMMAND_BASIC_H
+#ifndef CL_KHR_MUTABLE_COMMAND_BASIC_H
+#define CL_KHR_MUTABLE_COMMAND_BASIC_H
 
 #include "../basic_command_buffer.h"
 #include "../command_buffer_test_base.h"
@@ -84,24 +84,52 @@
                             &platform, nullptr);
         test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
 
-        // If it is supported get the addresses of all the APIs here.
-#define GET_EXTENSION_ADDRESS(FUNC)                                            \
-    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
-        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
-    if (FUNC == nullptr)                                                       \
-    {                                                                          \
-        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
-                  " with " #FUNC "\n");                                        \
-        return TEST_FAIL;                                                      \
+        GET_EXTENSION_ADDRESS(clUpdateMutableCommandsKHR);
+
+        return CL_SUCCESS;
     }
+
+    clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
+
+    const char* kernelString = "__kernel void empty() {}";
+    const size_t global_work_size = 4 * 16;
+};
+
+// Fixture layered on BasicMutableCommandBufferTest that additionally resolves
+// the clGetMutableCommandInfoKHR entry point, for tests that query mutable
+// command info.
+struct InfoMutableCommandBufferTest : BasicMutableCommandBufferTest
+{
+    InfoMutableCommandBufferTest(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Runs the base fixture setup, then loads this fixture's extension
+    // function. Returns CL_SUCCESS, or the status reported by
+    // init_extension_functions() on failure.
+    virtual cl_int SetUp(int elements) override
+    {
+        // NOTE(review): the return value of the base SetUp is discarded here;
+        // confirm whether a base setup failure should be propagated.
+        BasicMutableCommandBufferTest::SetUp(elements);
+
+        cl_int error = init_extension_functions();
+        test_error(error, "Unable to initialise extension functions");
+
+        return CL_SUCCESS;
+    }
+
+    // Queries the device's platform and resolves clGetMutableCommandInfoKHR
+    // through GET_EXTENSION_ADDRESS.
+    cl_int init_extension_functions()
+    {
+        // NOTE(review): the result of the BasicCommandBufferTest initialiser is
+        // ignored; confirm this is intentional.
+        BasicCommandBufferTest::init_extension_functions();
+
+        cl_platform_id platform;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                            &platform, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+
+        // NOTE(review): GET_EXTENSION_ADDRESS is defined outside this hunk;
+        // presumably it returns an error from this function when the lookup
+        // yields nullptr - confirm against its definition.
         GET_EXTENSION_ADDRESS(clGetMutableCommandInfoKHR);
 
         return CL_SUCCESS;
     }
 
     clGetMutableCommandInfoKHR_fn clGetMutableCommandInfoKHR = nullptr;
-    const char* kernelString = "__kernel void empty() {}";
-    const size_t global_work_size = 4 * sizeof(cl_int);
 };
 
-#endif //_CL_KHR_MUTABLE_COMMAND_BASIC_H
\ No newline at end of file
+#undef GET_EXTENSION_ADDRESS
+
+#endif // CL_KHR_MUTABLE_COMMAND_BASIC_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
new file mode 100644
index 0000000..80bc015
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
@@ -0,0 +1,170 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR
+
+// Checks that the global work offset of a recorded ND-range command can be
+// changed through clUpdateMutableCommandsKHR and read back through
+// clGetMutableCommandInfoKHR.
+struct MutableDispatchGlobalOffset : InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchGlobalOffset(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skips when the mutable dispatch capability query fails or does not
+    // include CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR, or when the base fixture
+    // requests a skip.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    // Runs the command buffer once without an offset, updates the offset to
+    // update_global_offset, runs it again and verifies both the queried offset
+    // and the buffer contents.
+    // Returns CL_SUCCESS / TEST_FAIL, or the failing OpenCL status.
+    cl_int Run() override
+    {
+        // Each work-item stores the current global offset at its global id.
+        const char *global_offset_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_global_offset(0);
+            })";
+
+        cl_int error =
+            create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &global_offset_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream;
+        stream = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
+                                nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            &update_global_offset /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR,
+            sizeof(info_global_offset), &info_global_offset, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_global_offset != update_global_offset)
+        {
+            log_error("ERROR: Wrong offset returned from "
+                      "clGetMutableCommandInfoKHR.\n");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData;
+        resultData.resize(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // The first run (no offset) stored 0 in [0, global_work_size). The
+        // second run covers ids starting at update_global_offset and stores
+        // the offset there, so only the leading elements keep the value 0.
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const cl_int expected = (i < update_global_offset)
+                ? 0
+                : static_cast<cl_int>(update_global_offset);
+
+            if (expected != resultData[i])
+            {
+                log_error("Data failed to verify: expected %d, "
+                          "resultData[%zu]=%d\n",
+                          expected, i, resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+        return CL_SUCCESS;
+    }
+
+    // Offset read back from clGetMutableCommandInfoKHR.
+    size_t info_global_offset = 0;
+    // Offset applied by the mutable update.
+    const size_t update_global_offset = 3;
+    // The buffer holds one cl_int per id touched by either run.
+    const size_t sizeToAllocate =
+        (global_work_size + update_global_offset) * sizeof(cl_int);
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    // Handle of the recorded ND-range command, used as the update target.
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchGlobalOffset fixture.
+int test_mutable_dispatch_global_offset(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+{
+    using Fixture = MutableDispatchGlobalOffset;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp
new file mode 100644
index 0000000..091f0c8
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp
@@ -0,0 +1,167 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR
+
+// Checks that the global work size of a recorded ND-range command can be
+// changed through clUpdateMutableCommandsKHR and read back through
+// clGetMutableCommandInfoKHR.
+struct MutableDispatchGlobalSize : public InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchGlobalSize(cl_device_id device, cl_context context,
+                              cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skips when the mutable dispatch capability query fails or does not
+    // include CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR, or when the base fixture
+    // requests a skip.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    // Runs the command buffer once with global_work_size work-items, shrinks
+    // the global size to update_global_size, runs it again and verifies both
+    // the queried size and the buffer contents.
+    // Returns CL_SUCCESS / TEST_FAIL, or the failing OpenCL status.
+    cl_int Run() override
+    {
+        // Each work-item stores the current global size at its global id.
+        const char *global_size_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_global_size(0);
+            })";
+
+        cl_int error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &global_size_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream;
+        stream = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
+                                nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            &update_global_size /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR,
+            sizeof(info_global_size), &info_global_size, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_global_size != update_global_size)
+        {
+            log_error("ERROR: Wrong size returned from "
+                      "clGetMutableCommandInfoKHR.\n");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData;
+        resultData.resize(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // The first run stored global_work_size in every element. The second
+        // run only covers the first update_global_size ids and stores the
+        // updated size there.
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const cl_int expected = static_cast<cl_int>(
+                i < update_global_size ? update_global_size
+                                       : global_work_size);
+
+            if (expected != resultData[i])
+            {
+                log_error("Data failed to verify: expected %d, "
+                          "resultData[%zu]=%d\n",
+                          expected, i, resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return CL_SUCCESS;
+    }
+
+    // Size read back from clGetMutableCommandInfoKHR.
+    size_t info_global_size = 0;
+    // Global size applied by the mutable update.
+    const size_t update_global_size = 3;
+    // One cl_int per work-item of the initial dispatch. global_work_size is a
+    // work-item count, not a byte count, so it must be scaled by the element
+    // size to avoid out-of-bounds kernel writes.
+    const size_t sizeToAllocate = global_work_size * sizeof(cl_int);
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    // Handle of the recorded ND-range command, used as the update target.
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Test entry point: forwards to MakeAndRunTest with the
+// MutableDispatchGlobalSize fixture.
+int test_mutable_dispatch_global_size(cl_device_id device, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    using Fixture = MutableDispatchGlobalSize;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
new file mode 100644
index 0000000..b1ce25e
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
@@ -0,0 +1,427 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <vector>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases for
+// CL_MUTABLE_DISPATCH_ARGUMENTS_KHR:
+// - image arguments
+
+// Checks that an image1d_t kernel argument of a recorded dispatch can be
+// replaced through clUpdateMutableCommandsKHR (ARGUMENTS_KHR capability).
+struct MutableDispatchImage1DArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchImage1DArguments(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // No extra state to prepare: setup is entirely the base fixture's.
+    virtual cl_int SetUp(int elements) override
+    {
+        // Propagate base failures instead of masking them as CL_SUCCESS.
+        return BasicMutableCommandBufferTest::SetUp(elements);
+    }
+
+    // Skip without image support or argument mutability, or if the base skips.
+    bool Skip() override
+    {
+        cl_bool image_support = CL_FALSE;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+                            sizeof(image_support), &image_support, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
+
+        cl_mutable_dispatch_fields_khr mutable_capabilities = 0;
+        // clGetDeviceInfo returns CL_SUCCESS (0) on success, hence the '!'.
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
+
+        return (!mutable_support || !image_support)
+            || BasicMutableCommandBufferTest::Skip();
+    }
+
+    // Copy src -> dst once, retarget argument 2, re-run, check the new image.
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(__kernel void sample_test( read_only image1d_t source, sampler_t
+            sampler, write_only image1d_t dest)
+            {
+               int offset = get_global_id(0);
+
+               int4 color = read_imagei( source, sampler, offset );
+
+               write_imagei( dest, offset, color );
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        cl_image_desc image_desc;
+        memset(&image_desc, 0x0, sizeof(cl_image_desc));
+        image_desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+        image_desc.image_width = 4;
+
+        // read_imagei/write_imagei are only defined for signed channel types.
+        const cl_image_format formats = { CL_RGBA, CL_SIGNED_INT8 };
+
+        image_descriptor imageInfo = { 0 };
+        imageInfo.type = CL_MEM_OBJECT_IMAGE1D;
+        imageInfo.format = &formats;
+        imageInfo.width = 4;
+
+        BufferOwningPtr<char> imageValues_input, imageValues_output, outputData;
+        MTdataHolder d(gRandomSeed);
+        generate_random_image_data(&imageInfo, imageValues_input, d);
+        generate_random_image_data(&imageInfo, imageValues_output, d);
+        generate_random_image_data(&imageInfo, outputData, d);
+
+        char *host_ptr_input = (char *)imageValues_input;
+        char *host_ptr_output = (char *)imageValues_output;
+
+        clMemWrapper src_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_input, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        clMemWrapper dst_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_output, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clSamplerWrapper sampler = clCreateSampler(
+            context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
+
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        // Only kernel arguments are declared updatable for this command.
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        size_t globalDim[3] = { 4, 1, 1 }, localDim[3] = { 1, 1, 1 };
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, globalDim,
+            localDim, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // First run, with the originally recorded arguments.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        // Replacement destination, bound to argument 2 by the update below.
+        clMemWrapper new_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_output, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        cl_mutable_dispatch_arg_khr arg_2{ 2, sizeof(cl_mem), &new_image };
+        cl_mutable_dispatch_arg_khr args[] = { arg_2 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        // Second run, now writing into new_image.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        size_t origin[3] = { 0, 0, 0 };
+        size_t region[3] = { image_desc.image_width, 1, 1 };
+        error = clEnqueueReadImage(queue, new_image, CL_TRUE, origin, region, 0,
+                                   0, outputData, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < imageInfo.width; ++i)
+        {
+            if (imageValues_input[i] != outputData[i])
+            {
+                log_error("Data failed to verify: imageValues[%zu]=%d != "
+                          "outputData[%zu]=%d\n",
+                          i, imageValues_input[i], i, outputData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Checks that an image2d_t kernel argument of a recorded dispatch can be
+// replaced through clUpdateMutableCommandsKHR (ARGUMENTS_KHR capability).
+struct MutableDispatchImage2DArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchImage2DArguments(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // No extra state to prepare: setup is entirely the base fixture's.
+    virtual cl_int SetUp(int elements) override
+    {
+        // Propagate base failures instead of masking them as CL_SUCCESS.
+        return BasicMutableCommandBufferTest::SetUp(elements);
+    }
+
+    // Skip without image support or argument mutability, or if the base skips.
+    bool Skip() override
+    {
+        cl_bool image_support = CL_FALSE;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+                            sizeof(image_support), &image_support, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
+
+        cl_mutable_dispatch_fields_khr mutable_capabilities = 0;
+        // clGetDeviceInfo returns CL_SUCCESS (0) on success, hence the '!'.
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
+
+        return (!mutable_support || !image_support)
+            || BasicMutableCommandBufferTest::Skip();
+    }
+
+    // Copy src -> dst once, retarget argument 2, re-run, check the new image.
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(__kernel void sample_test( read_only image2d_t source, sampler_t
+            sampler, write_only image2d_t dest)
+            {
+               int x = get_global_id(0);
+               int y = get_global_id(1);
+
+               int4 color = read_imagei( source, sampler, (int2) (x, y) );
+
+               write_imagei( dest, (int2) (x, y), color );
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        cl_image_desc image_desc;
+        memset(&image_desc, 0x0, sizeof(cl_image_desc));
+        image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+        image_desc.image_width = 4;
+        image_desc.image_height = 4;
+
+        size_t data_size =
+            image_desc.image_width * image_desc.image_height * sizeof(cl_int);
+
+        // read_imagei/write_imagei are only defined for signed channel types.
+        const cl_image_format formats = { CL_RGBA, CL_SIGNED_INT8 };
+
+        image_descriptor imageInfo = { 0 };
+        imageInfo.type = CL_MEM_OBJECT_IMAGE2D;
+        imageInfo.width = 4;
+        imageInfo.height = 4;
+        imageInfo.format = &formats;
+
+        BufferOwningPtr<char> imageValues_input, imageValues_output;
+        MTdataHolder d(gRandomSeed);
+        generate_random_image_data(&imageInfo, imageValues_input, d);
+        generate_random_image_data(&imageInfo, imageValues_output, d);
+
+        char *host_ptr_input = (char *)imageValues_input;
+        char *host_ptr_output = (char *)imageValues_output;
+        std::vector<char> outputData(data_size);
+
+        clMemWrapper src_image =
+            create_image_2d(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                            &formats, image_desc.image_width,
+                            image_desc.image_height, 0, host_ptr_input, &error);
+        test_error(error, "create_image_2d failed");
+
+        clMemWrapper dst_image = create_image_2d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, image_desc.image_height, 0, host_ptr_output,
+            &error);
+        test_error(error, "create_image_2d failed");
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clSamplerWrapper sampler = clCreateSampler(
+            context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
+
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        // Only kernel arguments are declared updatable for this command.
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        // The kernel indexes with get_global_id(1), so the range must be 2D;
+        // with work_dim 1 only row 0 of the 4x4 image would be written.
+        size_t globalDim[3] = { 4, 4, 1 }, localDim[3] = { 1, 1, 1 };
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 2, nullptr, globalDim,
+            localDim, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // First run, with the originally recorded arguments.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        // Replacement destination, bound to argument 2 by the update below.
+        clMemWrapper new_image = create_image_2d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, image_desc.image_height, 0,
+            imageValues_output, &error);
+        test_error(error, "create_image_2d failed");
+
+        cl_mutable_dispatch_arg_khr arg_2{ 2, sizeof(cl_mem), &new_image };
+        cl_mutable_dispatch_arg_khr args[] = { arg_2 };
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        // Second run, now writing into new_image.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        size_t origin[3] = { 0, 0, 0 };
+        size_t region[3] = { image_desc.image_width, image_desc.image_height,
+                             1 };
+        error = clEnqueueReadImage(queue, new_image, CL_TRUE, origin, region, 0,
+                                   0, outputData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < imageInfo.width * imageInfo.height; ++i)
+        {
+            if (imageValues_input[i] != outputData[i])
+            {
+                log_error("Data failed to verify: imageValues[%zu]=%d != "
+                          "outputData[%zu]=%d\n",
+                          i, imageValues_input[i], i, outputData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_image_1d_arguments(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    using Fixture = MutableDispatchImage1DArguments; // fixture under test
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_image_2d_arguments(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    using Fixture = MutableDispatchImage2DArguments; // fixture under test
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
index cc425a4..61600dc 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
@@ -42,13 +42,13 @@
 // CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR
 // CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR
 
-struct InfoDeviceQuery : public BasicMutableCommandBufferTest
+struct InfoDeviceQuery : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoDeviceQuery(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -71,12 +71,12 @@
     }
 };
 
-struct InfoBuffer : public BasicMutableCommandBufferTest
+struct InfoBuffer : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoBuffer(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -108,13 +108,13 @@
     cl_mutable_command_khr command = nullptr;
 };
 
-struct PropertiesArray : public BasicMutableCommandBufferTest
+struct PropertiesArray : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     PropertiesArray(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -140,7 +140,7 @@
         if (size != sizeof(props) || test_props[0] != props[0]
             || test_props[1] != props[1])
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect properties returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -154,12 +154,12 @@
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Kernel : public BasicMutableCommandBufferTest
+struct Kernel : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Kernel(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -181,7 +181,7 @@
         // opaque object.
         if (test_kernel != kernel)
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect kernel returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -195,12 +195,12 @@
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Dimensions : public BasicMutableCommandBufferTest
+struct Dimensions : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Dimensions(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -210,8 +210,7 @@
             &global_work_size, nullptr, 0, nullptr, nullptr, &command);
         test_error(error, "clCommandNDRangeKernelKHR failed");
 
-        size_t test_dimensions;
-
+        cl_uint test_dimensions = 0;
         error = clGetMutableCommandInfoKHR(
             command, CL_MUTABLE_DISPATCH_DIMENSIONS_KHR,
             sizeof(test_dimensions), &test_dimensions, nullptr);
@@ -219,7 +218,7 @@
 
         if (test_dimensions != dimensions)
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect dimensions returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -234,12 +233,12 @@
     const size_t dimensions = 3;
 };
 
-struct InfoType : public BasicMutableCommandBufferTest
+struct InfoType : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoType(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -271,12 +270,12 @@
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoQueue : public BasicMutableCommandBufferTest
+struct InfoQueue : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoQueue(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -308,13 +307,13 @@
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkOffset : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkOffset(cl_device_id device, cl_context context,
                          cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -330,7 +329,7 @@
 
         if (test_global_work_offset != global_work_offset)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong global work offset returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -346,13 +345,13 @@
     size_t test_global_work_offset = 0;
 };
 
-struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkSize(cl_device_id device, cl_context context,
                        cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -368,7 +367,7 @@
 
         if (test_global_work_size != global_work_size)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong global work size returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -383,13 +382,13 @@
     size_t test_global_work_size = 0;
 };
 
-struct InfoLocalWorkSize : public BasicMutableCommandBufferTest
+struct InfoLocalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoLocalWorkSize(cl_device_id device, cl_context context,
                       cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -405,7 +404,7 @@
 
         if (test_local_work_size != local_work_size)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong local work size returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp
new file mode 100644
index 0000000..22a9da6
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp
@@ -0,0 +1,174 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "typeWrappers.h"
+#include "procs.h"
+#include "testHarness.h"
+#include "mutable_command_basic.h"
+#include <vector>
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR
+
+// Checks that the local (and global) work size of a recorded dispatch can
+// be changed through clUpdateMutableCommandsKHR.
+struct MutableDispatchLocalSize : public InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchLocalSize(cl_device_id device, cl_context context,
+                             cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skip without local-size mutability, or if the base fixture skips.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities = 0;
+        // clGetDeviceInfo returns CL_SUCCESS (0) on success, hence the '!'.
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    // Run once as recorded, update both sizes, re-run, then verify.
+    cl_int Run() override
+    {
+        const char *local_size_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_local_size(0);
+            })";
+
+        cl_int error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &local_size_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                             sizeToAllocate, nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, &local_work_size, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // First run, with the recorded global and local sizes.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            &update_global_size /* global_work_size */,
+            &update_local_size /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        // Second run, with the updated global and local sizes.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR,
+            sizeof(info_local_size), &info_local_size, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+        if (info_local_size != update_local_size)
+        {
+            log_error("ERROR: Wrong size returned from "
+                      "clGetMutableCommandInfoKHR.");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData(num_elements);
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Elements below update_global_size were rewritten by the second
+        // run; the rest still hold the first run's local size.
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const bool updated = i < update_global_size;
+            const size_t expected =
+                updated ? update_local_size : local_work_size;
+            if (expected != resultData[i])
+            {
+                log_error("Data failed to verify: %s != resultData[%zu]=%d\n",
+                          updated ? "update_local_size" : "local_work_size", i,
+                          resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return CL_SUCCESS;
+    }
+
+    size_t info_local_size = 0;
+    // First run: 16 work-items in groups of 8; update: 8 in groups of 4.
+    const size_t global_work_size = 16;
+    const size_t local_work_size = 8;
+    const size_t update_global_size = 8;
+    const size_t update_local_size = 4;
+    const size_t sizeToAllocate = 64;
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_local_size(cl_device_id device, cl_context context,
+                                     cl_command_queue queue, int num_elements)
+{
+    using Fixture = MutableDispatchLocalSize; // fixture under test
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp
new file mode 100644
index 0000000..d507dad
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp
@@ -0,0 +1,454 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include <vector>
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle the following cases:
+// - out-of-order queue, single command-buffer enqueue at a time
+// - out-of-order queue, simultaneous command-buffer enqueues
+
+namespace {
+
+template <bool simultaneous_request>
+struct OutOfOrderTest : public BasicMutableCommandBufferTest
+{
+    OutOfOrderTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue),
+          out_of_order_queue(nullptr), out_of_order_command_buffer(this),
+          user_event(nullptr), wait_pass_event(nullptr), kernel_fill(nullptr),
+          program_fill(nullptr)
+    {
+        simultaneous_use_requested = simultaneous_request; // template argument
+        if (simultaneous_request) buffer_size_multiplier = 2; // two passes
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUpKernel() override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUpKernel();
+        test_error(error, "BasicMutableCommandBufferTest::SetUpKernel failed");
+
+        // extra kernel preparing the output: out[offset[0] + id] = pattern
+        const char* kernel_str =
+            R"(
+          __kernel void fill(int pattern, __global int* out, __global int*
+        offset)
+          {
+              size_t id = get_global_id(0);
+              size_t ind = offset[0] + id ;
+              out[ind] = pattern;
+          })";
+        // compile the source for this device and pick up the "fill" entry point
+        error = create_single_kernel_helper_create_program(
+            context, &program_fill, 1, &kernel_str);
+        test_error(error, "Failed to create program with source");
+
+        error =
+            clBuildProgram(program_fill, 1, &device, nullptr, nullptr, nullptr);
+        test_error(error, "Failed to build program");
+
+        kernel_fill = clCreateKernel(program_fill, "fill", &error);
+        test_error(error, "Failed to create fill kernel");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUpKernelArgs() override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUpKernelArgs();
+        test_error(error,
+                   "BasicMutableCommandBufferTest::SetUpKernelArgs failed");
+
+        // bind the fill(pattern, out, offset) arguments in declaration order
+        const size_t arg_sizes[] = { sizeof(cl_int), sizeof(out_mem),
+                                     sizeof(off_mem) };
+        const void* arg_values[] = { &overwritten_pattern, &out_mem, &off_mem };
+        for (cl_uint arg = 0; arg < 3; arg++)
+        {
+            error = clSetKernelArg(kernel_fill, arg, arg_sizes[arg],
+                                   arg_values[arg]);
+            test_error(error, "clSetKernelArg failed");
+        }
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicMutableCommandBufferTest::SetUp failed");
+        error = SetUpKernel();
+        test_error(error, "SetUpKernel failed");
+        out_of_order_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
+        test_error(error, "Unable to create command queue to test with");
+
+        // RunSimultaneous() re-enqueues while pending: needs simultaneous-use
+        cl_command_buffer_flags_khr flags = CL_COMMAND_BUFFER_MUTABLE_KHR;
+        if (simultaneous_use_support)
+            flags |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
+        cl_command_buffer_properties_khr properties[3] = {
+            CL_COMMAND_BUFFER_FLAGS_KHR, flags, 0
+        };
+        out_of_order_command_buffer = clCreateCommandBufferKHR(
+            1, &out_of_order_queue, properties, &error);
+        test_error(error, "clCreateCommandBufferKHR failed");
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    bool Skip() override
+    {
+        // the test mutates kernel arguments, so that capability is required
+        cl_mutable_dispatch_fields_khr fields;
+        const cl_int query_status = clGetDeviceInfo(
+            device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR, sizeof(fields),
+            &fields, nullptr);
+        const bool can_update_args = query_status == CL_SUCCESS
+            && (fields & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR) != 0;
+
+        if (!out_of_order_support || !can_update_args) return true;
+        if (simultaneous_use_requested && !simultaneous_use_support)
+            return true;
+        return BasicMutableCommandBufferTest::Skip();
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int Run() override
+    {
+        // Dispatches to the variant matching simultaneous_use_support; both
+        // report failures through test_error and return CL_SUCCESS otherwise.
+        if (!simultaneous_use_support)
+        {
+            // enqueue single command-buffer with out-of-order calls
+            const cl_int single_status = RunSingle();
+            test_error(single_status, "RunSingle failed");
+
+            return CL_SUCCESS;
+        }
+
+        // enqueue simultaneous command-buffers with out-of-order calls
+        const cl_int simultaneous_status = RunSimultaneous();
+        test_error(simultaneous_status, "RunSimultaneous failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RecordCommandBuffer()
+    {
+        // sync-points of the two fills; the kernel command waits on both
+        cl_sync_point_khr fill_done[2];
+
+        cl_int error = clCommandFillBufferKHR(
+            out_of_order_command_buffer, nullptr, in_mem, &pattern_pri,
+            sizeof(cl_int), 0, data_size(), 0, nullptr, &fill_done[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandFillBufferKHR(
+            out_of_order_command_buffer, nullptr, out_mem, &overwritten_pattern,
+            sizeof(cl_int), 0, data_size(), 0, nullptr, &fill_done[1], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        // keep the mutable handle in `command` so the dispatch can be updated
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, fill_done, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RunSingle()
+    {
+        cl_int error;
+        // record both fills plus the kernel dispatch, then finalize
+        error = RecordCommandBuffer();
+        test_error(error, "RecordCommandBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &single_event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // first run: blocking read gated on single_event, then verify
+        std::vector<cl_int> output_data(num_elements);
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data.data(), 1,
+                                    &single_event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
+        }
+        // second output buffer that will replace kernel argument 1
+        clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                                  sizeof(cl_int) * num_elements
+                                                      * buffer_size_multiplier,
+                                                  nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
+                                           &new_out_mem };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+        // only arg_list changes; counts of 0 / nullptr leave the rest as is
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(out_of_order_command_buffer,
+                                           &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+        // NOTE(review): single_event is reused - is the first event released?
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &single_event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // second run: the same pattern is now expected in new_out_mem
+        error = clEnqueueReadBuffer(out_of_order_queue, new_out_mem, CL_TRUE, 0,
+                                    data_size(), output_data.data(), 1,
+                                    &single_event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
+        }
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RecordSimultaneousCommandBuffer()
+    {
+        // [0] in_mem fill, [1] kernel_fill; the main kernel waits on both
+        cl_sync_point_khr deps[2];
+        // a single fill covers the in_mem range of all simultaneous passes
+        cl_int error = clCommandFillBufferKHR(
+            out_of_order_command_buffer, nullptr, in_mem, &pattern_pri,
+            sizeof(cl_int), 0, data_size() * buffer_size_multiplier, 0, nullptr,
+            &deps[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        // kernel_fill resets only the part of the result buffer relevant to
+        // the pass; a buffer fill command would overwrite all of it
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel_fill, 1,
+            nullptr, &num_elements, nullptr, 0, nullptr, &deps[1], &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        // `command` was also written above; the handle of this dispatch remains
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, deps, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    struct SimulPassData
+    {
+        cl_int offset; // pass offset in elements (also written to off_mem)
+        std::vector<cl_int> output_buffer; // host copy of the pass results
+        // [0] user event, [1] offset-buffer fill, [2] command-buffer completion
+        clEventWrapper wait_events[3];
+    };
+
+    //--------------------------------------------------------------------------
+    cl_int EnqueueSimultaneousPass(SimulPassData& pd)
+    {
+        cl_int error = CL_SUCCESS;
+        if (!user_event)
+        {
+            user_event = clCreateUserEvent(context, &error);
+            test_error(error, "clCreateUserEvent failed");
+        }
+        // wait_events[0]: the user event both command-buffer enqueues wait on
+        pd.wait_events[0] = user_event;
+
+        // filling the offset buffer must wait for the previous pass to complete
+        error = clEnqueueFillBuffer(
+            out_of_order_queue, off_mem, &pd.offset, sizeof(cl_int), 0,
+            sizeof(cl_int), (wait_pass_event != nullptr ? 1 : 0),
+            (wait_pass_event != nullptr ? &wait_pass_event : nullptr),
+            &pd.wait_events[1]);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        // command buffer execution must wait for two wait-events
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 2, &pd.wait_events[0],
+            &pd.wait_events[2]);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // non-blocking read of this pass's slice of out_mem
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_FALSE,
+                                    pd.offset * sizeof(cl_int), data_size(),
+                                    pd.output_buffer.data(), 1,
+                                    &pd.wait_events[2], nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+        // replacement buffer for kernel argument 1 (local to this call)
+        clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                                  sizeof(cl_int) * num_elements
+                                                      * buffer_size_multiplier,
+                                                  nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
+                                           &new_out_mem };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        // NOTE(review): update is issued while earlier enqueues are pending
+        error = clUpdateMutableCommandsKHR(out_of_order_command_buffer,
+                                           &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+        // NOTE(review): wait_events[2] of the first enqueue gets overwritten
+        // command buffer execution must wait for two wait-events
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 2, &pd.wait_events[0],
+            &pd.wait_events[2]);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // NOTE(review): same host buffer as the read above, also non-blocking
+        error = clEnqueueReadBuffer(out_of_order_queue, new_out_mem, CL_FALSE,
+                                    pd.offset * sizeof(cl_int), data_size(),
+                                    pd.output_buffer.data(), 1,
+                                    &pd.wait_events[2], nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RunSimultaneous()
+    {
+        cl_int error = RecordSimultaneousCommandBuffer();
+        test_error(error, "RecordSimultaneousCommandBuffer failed");
+
+        // pass 0 starts at element 0, pass 1 right after num_elements elements
+        const cl_int second_pass_offset = static_cast<cl_int>(num_elements);
+        std::vector<SimulPassData> passes = {
+            { 0, std::vector<cl_int>(num_elements) },
+            { second_pass_offset, std::vector<cl_int>(num_elements) }
+        };
+
+        // wait_pass_event makes the next pass's offset fill wait for this pass
+        for (size_t p = 0; p < passes.size(); p++)
+        {
+            error = EnqueueSimultaneousPass(passes[p]);
+            test_error(error, "EnqueueSimultaneousPass failed");
+            wait_pass_event = passes[p].wait_events[2];
+        }
+
+        // release the user event, then drain the queue before verification
+        error = clSetUserEventStatus(user_event, CL_COMPLETE);
+        test_error(error, "clSetUserEventStatus failed");
+        error = clFinish(out_of_order_queue);
+        test_error(error, "clFinish failed");
+
+        // verify the result buffers
+        for (SimulPassData& pass : passes)
+        {
+            std::vector<cl_int>& res_data = pass.output_buffer;
+            for (size_t i = 0; i < num_elements; i++)
+            {
+                CHECK_VERIFICATION_ERROR(pattern_pri, res_data[i], i);
+            }
+        }
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    clCommandQueueWrapper out_of_order_queue;
+    clCommandBufferWrapper out_of_order_command_buffer;
+
+    clEventWrapper user_event; // waited on by the simultaneous enqueues
+    clEventWrapper single_event; // command-buffer event in RunSingle
+    clEventWrapper wait_pass_event; // command-buffer event of the last pass
+
+    clKernelWrapper kernel_fill; // "fill" kernel created in SetUpKernel
+    clProgramWrapper program_fill;
+    // NOTE(review): test_global_work_size is not referenced in this struct
+    const size_t test_global_work_size = 3 * sizeof(cl_int);
+    cl_mutable_command_khr command = nullptr; // handle of the mutable dispatch
+
+    const cl_int overwritten_pattern = 0xACDC; // initial output content
+    const cl_int pattern_pri = 42; // value expected in the results
+};
+
+} // anonymous namespace
+
+int test_mutable_dispatch_out_of_order(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int num_elements)
+{
+    using Test = OutOfOrderTest<false>; // no simultaneous use requested
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_simultaneous_out_of_order(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements)
+{
+    using Test = OutOfOrderTest<true>; // simultaneous use requested
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
index 08512ca..1db4891 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
-#define _CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+#ifndef CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
 
 #include <CL/cl.h>
 
@@ -59,4 +59,51 @@
                                                       cl_context context,
                                                       cl_command_queue queue,
                                                       int num_elements);
+extern int test_mutable_dispatch_image_1d_arguments(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_mutable_dispatch_image_2d_arguments(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_mutable_dispatch_global_arguments(cl_device_id device,
+                                                  cl_context context,
+                                                  cl_command_queue queue,
+                                                  int num_elements);
+extern int test_mutable_dispatch_local_arguments(cl_device_id device,
+                                                 cl_context context,
+                                                 cl_command_queue queue,
+                                                 int num_elements);
+extern int test_mutable_dispatch_pod_arguments(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+extern int test_mutable_dispatch_null_arguments(cl_device_id device,
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements);
+extern int test_mutable_dispatch_svm_arguments(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+extern int test_mutable_dispatch_out_of_order(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements);
+extern int test_mutable_dispatch_simultaneous_out_of_order(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_mutable_dispatch_global_size(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_mutable_dispatch_local_size(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements);
+extern int test_mutable_dispatch_global_offset(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
 #endif /*_CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H*/
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_event_sync.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_event_sync.cpp
index be8530b..6ef26bb 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_event_sync.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_event_sync.cpp
@@ -603,12 +603,15 @@
                                     event_ptrs[1], nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
-        error = clFinish(queue);
-        test_error(error, "clFinish failed");
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
 
         error = clFinish(queue_sec);
         test_error(error, "clFinish failed");
 
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
         // verify the result - result buffer must contain initial pattern
         for (size_t i = 0; i < num_elements; i++)
         {
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp
new file mode 100644
index 0000000..bd66916
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp
@@ -0,0 +1,85 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "basic_command_buffer.h"
+#include "procs.h"
+
+namespace {
+
+// Test that finalizing a command-buffer that has already been finalized returns
+// the correct error code.
+struct FinalizeInvalid : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        const cl_int record_status = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(record_status, "clCommandNDRangeKernelKHR failed");
+
+        // the first finalize is legal and has to succeed
+        const cl_int first_status = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(first_status, "clFinalizeCommandBufferKHR failed");
+
+        // a second finalize of the same command-buffer is the error under test
+        const cl_int second_status = clFinalizeCommandBufferKHR(command_buffer);
+        test_failure_error_ret(
+            second_status, CL_INVALID_OPERATION,
+            "clFinalizeCommandBufferKHR should return CL_INVALID_OPERATION",
+            TEST_FAIL);
+
+        return CL_SUCCESS;
+    }
+};
+
+// Check that an empty command-buffer can be finalized and then executed.
+struct FinalizeEmpty : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        // nothing is recorded here: the command-buffer is finalized as it is
+        const cl_int finalized = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(finalized, "clFinalizeCommandBufferKHR failed");
+
+        // run the command-buffer and block until its event completes
+        clEventWrapper done;
+        const cl_int enqueued = clEnqueueCommandBufferKHR(
+            0, nullptr, command_buffer, 0, nullptr, &done);
+        test_error(enqueued, "clEnqueueCommandBufferKHR failed");
+
+        const cl_int waited = clWaitForEvents(1, &done);
+        test_error(waited, "clWaitForEvents failed");
+
+        return CL_SUCCESS;
+    }
+};
+} // anonymous namespace
+
+int test_finalize_invalid(cl_device_id device, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    using Test = FinalizeInvalid; // double-finalize error-code check
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
+
+int test_finalize_empty(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{ // hands the FinalizeEmpty fixture and all arguments to MakeAndRunTest
+    return MakeAndRunTest<FinalizeEmpty>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
index 3ce410c..6344197 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
@@ -26,6 +26,7 @@
     CITM_REF_COUNT,
     CITM_STATE,
     CITM_PROP_ARRAY,
+    CITM_CONTEXT,
 };
 
 namespace {
@@ -38,6 +39,7 @@
 // -test case for CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR query
 // -test case for CL_COMMAND_BUFFER_STATE_KHR query
 // -test case for CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR query
+// -test case for CL_COMMAND_BUFFER_CONTEXT_KHR query
 
 template <CombufInfoTestMode test_mode>
 struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
@@ -70,6 +72,10 @@
                 error = RunPropArrayInfoTest();
                 test_error(error, "RunPropArrayInfoTest failed");
                 break;
+            case CombufInfoTestMode::CITM_CONTEXT:
+                error = RunContextInfoTest();
+                test_error(error, "RunContextInfoTest failed");
+                break;
         }
 
         return CL_SUCCESS;
@@ -130,7 +136,7 @@
 
         // We can not check if this is the right queue because this is an opaque
         // object, test against NULL.
-        for (int i = 0; i < queue_list.size(); i++)
+        for (size_t i = 0; i < queue_list.size(); i++)
         {
             test_assert_error(
                 queue_list[i] == queue,
@@ -205,8 +211,7 @@
 
         // lambda to verify given state
         auto verify_state = [&](const cl_command_buffer_state_khr &expected) {
-            cl_command_buffer_state_khr state =
-                CL_COMMAND_BUFFER_STATE_INVALID_KHR;
+            cl_command_buffer_state_khr state = ~cl_command_buffer_state_khr(0);
 
             cl_int error = clGetCommandBufferInfoKHR(
                 command_buffer, CL_COMMAND_BUFFER_STATE_KHR, sizeof(state),
@@ -240,9 +245,10 @@
         clEventWrapper trigger_event = clCreateUserEvent(context, &error);
         test_error(error, "clCreateUserEvent failed");
 
+        clEventWrapper execute_event;
         // enqueued command buffer blocked on user event
         error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
-                                          &trigger_event, nullptr);
+                                          &trigger_event, &execute_event);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
         // verify pending state
@@ -255,6 +261,13 @@
 
         test_error(signal_error, "clSetUserEventStatus failed");
 
+        error = clWaitForEvents(1, &execute_event);
+        test_error(error, "Unable to wait for execute event");
+
+        // verify executable state
+        error = verify_state(CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR);
+        test_error(error, "verify_state failed");
+
         return CL_SUCCESS;
     }
 
@@ -315,6 +328,46 @@
         return TEST_FAIL;
     }
 
+    cl_int RunContextInfoTest()
+    {
+        // record command buffers
+        cl_int error = RecordCommandBuffer();
+        test_error(error, "RecordCommandBuffer failed");
+
+        // size-only query: the reported size must be that of a cl_context
+        size_t ret_value_size = 0;
+        error = clGetCommandBufferInfoKHR(
+            command_buffer, CL_COMMAND_BUFFER_CONTEXT_KHR, 0, nullptr,
+            &ret_value_size);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+
+        test_assert_error(
+            ret_value_size == sizeof(cl_context),
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        // value query: a context handle has to come back
+        cl_context ret_context = nullptr;
+        error = clGetCommandBufferInfoKHR(
+            command_buffer, CL_COMMAND_BUFFER_CONTEXT_KHR, sizeof(ret_context),
+            &ret_context, nullptr);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+        test_assert_error(
+            ret_context != nullptr,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        // the reference is the context the test queue reports for itself
+        cl_context expected_context = nullptr;
+        error = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
+                                      sizeof(expected_context),
+                                      &expected_context, nullptr);
+        test_error(error, "clGetCommandQueueInfo failed");
+        test_assert_error(
+            ret_context == expected_context,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        return TEST_PASS;
+    }
+
     const cl_int pattern = 0xE;
 };
 
@@ -352,3 +405,11 @@
         CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_PROP_ARRAY>>(
         device, context, queue, num_elements);
 }
+// Harness entry point: runs the info fixture in CITM_CONTEXT mode.
+int test_info_context(cl_device_id device, cl_context context,
+                      cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<
+        CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_CONTEXT>>(
+        device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp
index 28d8045..c06bbf7 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp
@@ -160,7 +160,7 @@
 
         // verify the results by comparing timestamps
         bool all_vals_0 = prof_params.front().value != 0;
-        for (int i = 1; i < prof_params.size(); i++)
+        for (size_t i = 1; i < prof_params.size(); i++)
         {
             all_vals_0 = (prof_params[i].value != 0) ? false : all_vals_0;
             if (prof_params[i - 1].value > prof_params[i].value)
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
index d73fc9c..82ff16f 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
@@ -70,15 +70,42 @@
             0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
-                                    data_size(), output_data.data(), 1, &event,
-                                    nullptr);
+                                    data_size(), output_data_1.data(), 1,
+                                    &event, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        /* Second enqueue: zero both buffers and drain `queue` before replay */
+        error =
+            clEnqueueFillBuffer(queue, in_mem, &zero_pattern, sizeof(cl_int), 0,
+                                data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error =
+            clEnqueueFillBuffer(queue, out_mem, &zero_pattern, sizeof(cl_int),
+                                0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+        error = clFinish(queue); // the replay below waits on no events
+        test_error(error, "clFinish failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data_2.data(), 1,
+                                    &event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -106,6 +133,7 @@
     }
 
     const cl_int pattern = 0x16;
+    const cl_int zero_pattern = 0x0;
     clCommandQueueWrapper out_of_order_queue;
     clCommandBufferWrapper out_of_order_command_buffer;
     clEventWrapper event;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
index 0fd2e4e..48abe25 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
-#define _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#ifndef CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#define CL_KHR_COMMAND_BUFFER_TEST_BASE_H
 
 #include <CL/cl_ext.h>
 #include "harness/deviceInfo.h"
@@ -174,4 +174,4 @@
     }
 
 
-#endif // _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#endif // CL_KHR_COMMAND_BUFFER_TEST_BASE_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
index 102ae76..0a30e76 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "basic_command_buffer.h"
+#include "svm_command_basic.h"
 #include "harness/typeWrappers.h"
 #include "procs.h"
 
@@ -38,7 +39,7 @@
     cl_int Run() override
     {
         cl_int error = clCommandFillImageKHR(command_buffer, nullptr, src_image,
-                                             fill_color, origin, region, 0,
+                                             fill_color_1, origin, region, 0,
                                              nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -56,13 +57,38 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
-        error = clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0,
-                                   0, output_data.data(), 0, nullptr, nullptr);
+        std::vector<cl_char> output_data_1(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_1.data(), 0, nullptr, nullptr);
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Second enqueue: reset both images to fill_color_2 before replay */
+        error = clEnqueueFillImage(queue, src_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueFillImage(queue, dst_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -97,8 +123,12 @@
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x05;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x05;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x1;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
     clMemWrapper src_image;
     clMemWrapper dst_image;
@@ -111,7 +141,7 @@
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -127,20 +157,113 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Second enqueue: reset both buffers to pattern_2 before the replay */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const cl_char pattern = 0x14;
+    const cl_char pattern_1 = 0x14;
+    const cl_char pattern_2 = 0x28;
+};
+
+struct CopySVMBufferKHR : public BasicSVMCommandBufferTest
+{
+    using BasicSVMCommandBufferTest::BasicSVMCommandBufferTest;
+    // Records an SVM fill followed by an SVM copy, then enqueues it twice.
+    cl_int Run() override
+    {
+        cl_int error = clCommandSVMMemFillKHR(
+            command_buffer, nullptr, svm_in_mem(), &pattern_1, sizeof(cl_char),
+            data_size(), 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandSVMMemFillKHR failed");
+
+        error = clCommandSVMMemcpyKHR(command_buffer, nullptr, svm_out_mem(),
+                                      svm_in_mem(), data_size(), 0, nullptr,
+                                      nullptr, nullptr);
+        test_error(error, "clCommandSVMMemcpyKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+        // Blocking copy of svm_out_mem() to the host for the first check.
+        std::vector<cl_char> output_data_1(data_size());
+        error =
+            clEnqueueSVMMemcpy(queue, CL_TRUE, output_data_1.data(),
+                               svm_out_mem(), data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Second enqueue: overwrite both allocations with pattern_2 first */
+        error = clEnqueueSVMMemFill(queue, svm_in_mem(), &pattern_2,
+                                    sizeof(cl_char), data_size(), 0, nullptr,
+                                    nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed");
+
+        error = clEnqueueSVMMemFill(queue, svm_out_mem(), &pattern_2,
+                                    sizeof(cl_char), data_size(), 0, nullptr,
+                                    nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed");
+        // Replay the unchanged command buffer; pattern_1 is expected again.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
+
+        error =
+            clEnqueueSVMMemcpy(queue, CL_TRUE, output_data_2.data(),
+                               svm_out_mem(), data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
+        }
+
+        return CL_SUCCESS;
+    }
+    // pattern_1: value the command buffer writes; pattern_2: reset value.
+    const cl_char pattern_1 = 0x14;
+    const cl_char pattern_2 = 0x28;
 };
 
 struct CopyBufferToImageKHR : public BasicCommandBufferTest
@@ -150,7 +273,7 @@
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, buffer, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, buffer, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -168,15 +291,40 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadImage failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillImage(queue, image, &fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -211,7 +359,14 @@
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x11;
+    const cl_char pattern_1 = 0x11;
+    const cl_char pattern_2 = 0x22;
+
+    const cl_uint fill_color_2[4] = { static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2) };
+
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper buffer;
@@ -225,7 +380,7 @@
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -243,16 +398,39 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        /* Second enqueue: reset image and buffer to pattern_2 before replay */
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -287,8 +465,12 @@
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x12;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x12;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x24;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -302,7 +484,7 @@
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -319,14 +501,38 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -353,7 +559,8 @@
     const size_t data_size = img_width * img_height * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x13;
+    const cl_char pattern_1 = 0x13;
+    const cl_char pattern_2 = 0x26;
 
     clMemWrapper in_mem;
     clMemWrapper out_mem;
@@ -372,6 +579,14 @@
     return MakeAndRunTest<CopyBufferKHR>(device, context, queue, num_elements);
 }
 
+// Harness entry point: runs the CopySVMBufferKHR test.
+int test_copy_svm_buffer(cl_device_id device, cl_context context,
+                         cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<CopySVMBufferKHR>(device, context, queue,
+                                            num_elements);
+}
+
 int test_copy_buffer_to_image(cl_device_id device, cl_context context,
                               cl_command_queue queue, int num_elements)
 {
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
index 88e97a2..67809cf 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "basic_command_buffer.h"
+#include "svm_command_basic.h"
 #include "harness/typeWrappers.h"
 #include "procs.h"
 
@@ -35,7 +36,7 @@
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -47,14 +48,34 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        /* Second enqueue: overwrite the image with fill_color_2 before replay */
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -85,8 +106,12 @@
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x10;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x10;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x20;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -99,7 +124,7 @@
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -111,22 +136,100 @@
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
         error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Second enqueue: overwrite the buffer with pattern_2 before replay */
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
+        error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const char pattern = 0x15;
+    const char pattern_1 = 0x15;
+    const char pattern_2 = 0x30;
 };
 
+struct FillSVMBufferKHR : public BasicSVMCommandBufferTest
+{
+    using BasicSVMCommandBufferTest::BasicSVMCommandBufferTest;
+    // Records a single SVM fill command, then enqueues it twice.
+    cl_int Run() override
+    {
+        cl_int error = clCommandSVMMemFillKHR(
+            command_buffer, nullptr, svm_in_mem(), &pattern_1, sizeof(cl_char),
+            data_size(), 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandSVMMemFillKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_1(data_size());
+        // Blocking copy of svm_in_mem() to the host for the first check.
+        error =
+            clEnqueueSVMMemcpy(queue, CL_TRUE, output_data_1.data(),
+                               svm_in_mem(), data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Second enqueue: overwrite the allocation with pattern_2 first */
+        error = clEnqueueSVMMemFill(queue, svm_in_mem(), &pattern_2,
+                                    sizeof(cl_char), data_size(), 0, nullptr,
+                                    nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed");
+        // Replay the unchanged command buffer; pattern_1 is expected again.
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
+
+        error =
+            clEnqueueSVMMemcpy(queue, CL_TRUE, output_data_2.data(),
+                               svm_in_mem(), data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
+        }
+
+        return CL_SUCCESS;
+    }
+    // pattern_1: value the command buffer writes; pattern_2: reset value.
+    const char pattern_1 = 0x15;
+    const char pattern_2 = 0x30;
+};
 };
 
 int test_fill_buffer(cl_device_id device, cl_context context,
@@ -135,6 +238,14 @@
     return MakeAndRunTest<FillBufferKHR>(device, context, queue, num_elements);
 }
 
+// Harness entry point: runs the FillSVMBufferKHR test.
+int test_fill_svm_buffer(cl_device_id device, cl_context context,
+                         cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<FillSVMBufferKHR>(device, context, queue,
+                                            num_elements);
+}
+
 int test_fill_image(cl_device_id device, cl_context context,
                     cl_command_queue queue, int num_elements)
 {
diff --git a/test_conformance/extensions/cl_khr_command_buffer/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
index 4eefc8a..4ecb080 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
@@ -26,6 +26,7 @@
     ADD_TEST(info_ref_count),
     ADD_TEST(info_state),
     ADD_TEST(info_prop_array),
+    ADD_TEST(info_context),
     ADD_TEST(basic_profiling),
     ADD_TEST(simultaneous_profiling),
     ADD_TEST(regular_wait_for_command_buffer),
@@ -44,8 +45,10 @@
     ADD_TEST(simultaneous_queue_substitution),
     ADD_TEST(fill_image),
     ADD_TEST(fill_buffer),
+    ADD_TEST(fill_svm_buffer),
     ADD_TEST(copy_image),
     ADD_TEST(copy_buffer),
+    ADD_TEST(copy_svm_buffer),
     ADD_TEST(copy_buffer_to_image),
     ADD_TEST(copy_image_to_buffer),
     ADD_TEST(copy_buffer_rect),
@@ -58,7 +61,9 @@
     ADD_TEST(event_info_command_queue),
     ADD_TEST(event_info_execution_status),
     ADD_TEST(event_info_context),
-    ADD_TEST(event_info_reference_count)
+    ADD_TEST(event_info_reference_count),
+    ADD_TEST(finalize_invalid),
+    ADD_TEST(finalize_empty)
 };
 
 int main(int argc, const char *argv[])
diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h
index 63e004a..ce121ce 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_COMMAND_BUFFER_PROCS_H
-#define _CL_KHR_COMMAND_BUFFER_PROCS_H
+#ifndef CL_KHR_COMMAND_BUFFER_PROCS_H
+#define CL_KHR_COMMAND_BUFFER_PROCS_H
 
 #include <CL/cl.h>
 
@@ -41,6 +41,8 @@
                            cl_command_queue queue, int num_elements);
 extern int test_info_prop_array(cl_device_id device, cl_context context,
                                 cl_command_queue queue, int num_elements);
+extern int test_info_context(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements);
 extern int test_basic_set_kernel_arg(cl_device_id device, cl_context context,
                                      cl_command_queue queue, int num_elements);
 extern int test_pending_set_kernel_arg(cl_device_id device, cl_context context,
@@ -101,10 +103,14 @@
                            cl_command_queue queue, int num_elements);
 extern int test_fill_buffer(cl_device_id device, cl_context context,
                             cl_command_queue queue, int num_elements);
+extern int test_fill_svm_buffer(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements);
 extern int test_copy_image(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements);
 extern int test_copy_buffer(cl_device_id device, cl_context context,
                             cl_command_queue queue, int num_elements);
+extern int test_copy_svm_buffer(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements);
 extern int test_copy_buffer_to_image(cl_device_id device, cl_context context,
                                      cl_command_queue queue, int num_elements);
 extern int test_copy_image_to_buffer(cl_device_id device, cl_context context,
@@ -130,5 +136,9 @@
                                            cl_context context,
                                            cl_command_queue queue,
                                            int num_elements);
+extern int test_finalize_invalid(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+extern int test_finalize_empty(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int num_elements);
 
-#endif /*_CL_KHR_COMMAND_BUFFER_PROCS_H*/
+#endif // CL_KHR_COMMAND_BUFFER_PROCS_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.cpp b/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.cpp
new file mode 100644
index 0000000..1fc48ce
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.cpp
@@ -0,0 +1,94 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "svm_command_basic.h"
+
+//--------------------------------------------------------------------------
+// Returns true (skip) unless the device offers OpenCL 2.0+ SVM support.
+bool BasicSVMCommandBufferTest::Skip()
+{
+    if (BasicCommandBufferTest::Skip()) return true;
+
+    Version version = get_device_cl_version(device);
+    if (version < Version(2, 0))
+    {
+        log_info("test requires OpenCL 2.x/3.0 device\n");
+        return true;
+    }
+
+    cl_device_svm_capabilities svm_capabilities;
+    cl_int error =
+        clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES,
+                        sizeof(svm_capabilities), &svm_capabilities, nullptr);
+    if (error != CL_SUCCESS)
+    {
+        print_error(error, "Unable to query CL_DEVICE_SVM_CAPABILITIES");
+        return true;
+    }
+    // A zero mask means no SVM support at all (coarse-grain is the baseline).
+    if (svm_capabilities == 0)
+    {
+        log_info("Device property CL_DEVICE_SVM_COARSE_GRAIN_BUFFER not "
+                 "supported \n");
+        return true;
+    }
+
+    if (init_extension_functions() != CL_SUCCESS)
+    {
+        log_error("Unable to initialise extension functions\n");
+        return true;
+    }
+
+    return false;
+}
+
+//--------------------------------------------------------------------------
+// Allocates equally sized SVM regions into svm_in_mem and svm_out_mem.
+cl_int BasicSVMCommandBufferTest::SetUpKernelArgs(void)
+{
+    size_t size = sizeof(cl_int) * num_elements * buffer_size_multiplier;
+    svm_in_mem = clSVMWrapper(context, size);
+    if (svm_in_mem() == nullptr)
+    {
+        log_error("Unable to allocate SVM memory\n");
+        return CL_OUT_OF_RESOURCES;
+    }
+    svm_out_mem = clSVMWrapper(context, size);
+    if (svm_out_mem() == nullptr)
+    {
+        log_error("Unable to allocate SVM memory\n");
+        return CL_OUT_OF_RESOURCES;
+    }
+    return CL_SUCCESS;
+}
+
+//--------------------------------------------------------------------------
+// Runs the base-class lookup, then resolves the two SVM command functions.
+cl_int BasicSVMCommandBufferTest::init_extension_functions()
+{
+    cl_int error = BasicCommandBufferTest::init_extension_functions();
+    test_error(error, "Unable to initialise extension functions");
+
+    cl_platform_id platform;
+    error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                            &platform, nullptr);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+    // NOTE(review): GET_EXTENSION_ADDRESS presumably uses `platform`.
+    GET_EXTENSION_ADDRESS(clCommandSVMMemFillKHR);
+    GET_EXTENSION_ADDRESS(clCommandSVMMemcpyKHR);
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.h b/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.h
new file mode 100644
index 0000000..f6b6b42
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/svm_command_basic.h
@@ -0,0 +1,42 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef CL_KHR_SVM_COMMAND_BASIC_H
+#define CL_KHR_SVM_COMMAND_BASIC_H
+
+#include "basic_command_buffer.h"
+
+// Command-buffer test fixture adding SVM buffers and SVM command functions.
+struct BasicSVMCommandBufferTest : BasicCommandBufferTest
+{
+    BasicSVMCommandBufferTest(cl_device_id device, cl_context context,
+                              cl_command_queue queue)
+        : BasicCommandBufferTest(device, context, queue)
+    {}
+
+    virtual bool Skip() override;
+    virtual cl_int SetUpKernelArgs(void) override;
+
+protected:
+    cl_int init_extension_functions();
+    // Extension entry points; they stay nullptr until looked up.
+    clCommandSVMMemFillKHR_fn clCommandSVMMemFillKHR = nullptr;
+    clCommandSVMMemcpyKHR_fn clCommandSVMMemcpyKHR = nullptr;
+    // SVM allocations assigned in SetUpKernelArgs().
+    clSVMWrapper svm_in_mem, svm_out_mem;
+};
+
+#endif // CL_KHR_SVM_COMMAND_BASIC_H
diff --git a/test_conformance/extensions/cl_khr_external_semaphore/procs.h b/test_conformance/extensions/cl_khr_external_semaphore/procs.h
index 753c8fe..7e1c4ca 100644
--- a/test_conformance/extensions/cl_khr_external_semaphore/procs.h
+++ b/test_conformance/extensions/cl_khr_external_semaphore/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
-#define _CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
+#ifndef CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
+#define CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
 
 #include <CL/cl.h>
 
@@ -79,4 +79,4 @@
                                                     cl_context context,
                                                     cl_command_queue queue,
                                                     int num_elements);
-#endif /* CL_KHR_EXTERNAL_SEMAPHORE */
+#endif // CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
diff --git a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
index a7ed307..89ab17b 100644
--- a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
+++ b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
@@ -120,9 +120,11 @@
     SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_TYPE_KHR, cl_semaphore_type_khr,
                          CL_SEMAPHORE_TYPE_BINARY_KHR);
 
-    SEMAPHORE_PARAM_TEST(CL_DEVICE_HANDLE_LIST_KHR, cl_uint, 1);
+    SEMAPHORE_PARAM_TEST(CL_DEVICE_HANDLE_LIST_KHR, cl_device_id, deviceID);
 
-    SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, cl_uint, 1);
+    SEMAPHORE_PARAM_TEST(
+        CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, cl_uint,
+        getCLSemaphoreTypeFromVulkanType(vkExternalSemaphoreHandleType));
 
     // Confirm that querying CL_SEMAPHORE_CONTEXT_KHR returns the right context
     SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_CONTEXT_KHR, cl_context, context);
@@ -290,7 +292,7 @@
                                      nullptr, 0, nullptr, &wait_event);
     test_error(err, "Could not wait semaphore");
 
-    // Finish queue_1 and queue_2
+    // Finish queue_1 and queue_2
     err = clFinish(queue_1);
     test_error(err, "Could not finish queue");
 
@@ -304,7 +306,7 @@
     return TEST_PASS;
 }
 
-// Confirm that a signal followed by a wait will complete successfully
+// Confirm that a signal followed by a wait will complete successfully
 int test_external_semaphores_simple_1(cl_device_id deviceID, cl_context context,
                                       cl_command_queue defaultQueue,
                                       int num_elements)
@@ -931,420 +933,3 @@
 
     return TEST_PASS;
 }
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first one deferred wait, then one non deferred signal.
-int test_external_semaphores_order_1(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user event
-    clEventWrapper user_event = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Wait semaphore (dependency on user_event)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore
-    clEventWrapper signal_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 0, nullptr, &signal_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure signal event is completed while wait event is not
-    test_assert_event_complete(signal_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event
-    err = clSetUserEventStatus(user_event, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_event);
-    test_assert_event_complete(wait_event);
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then one deferred wait. Unblock
-// signal, then unblock wait. When wait completes, unblock the other signal.
-int test_external_semaphores_order_2(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_2,
-                                       &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_3, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are completed except for second signal
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then two deferred waits. Unblock
-// one signal and one wait (both blocked by the same user event). When wait
-// completes, unblock the other signal. Then unblock the other wait.
-int test_external_semaphores_order_3(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_2,
-                                       &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_1_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_3, &wait_1_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Wait semaphore (dependency on user_event_2)
-    clEventWrapper wait_2_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_2, &wait_2_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure only second signal and second wait completed
-    cl_event event_list[] = { signal_2_event, wait_2_event };
-    err = clWaitForEvents(2, event_list);
-    test_error(err, "Could not wait for events");
-
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(wait_1_event);
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_1_event);
-    test_assert_event_complete(wait_2_event);
-
-    return TEST_PASS;
-}
-
-// Test that an invalid semaphore command results in the invalidation of the
-// command's event and the dependencies' events
-int test_external_semaphores_invalid_command(cl_device_id deviceID,
-                                             cl_context context,
-                                             cl_command_queue defaultQueue,
-                                             int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore1(vkDevice, vkExternalSemaphoreHandleType);
-    VulkanSemaphore vkVk2CLSemaphore2(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
-                                   vkExternalSemaphoreHandleType, deviceID);
-    clExternalSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
-                                   vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore_1 (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext_1.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore_1 and semaphore_2 (dependency on user_event_1)
-    clEventWrapper wait_event;
-    cl_semaphore_khr sema_list[] = { sema_ext_1.getCLSemaphore(),
-                                     sema_ext_2.getCLSemaphore() };
-    err = clEnqueueWaitSemaphoresKHR(queue, 2, sema_list, nullptr, 1,
-                                     &user_event_1, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore_1 (dependency on wait_event and user_event_2)
-    clEventWrapper signal_2_event;
-    cl_event wait_list[] = { user_event_2, wait_event };
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext_1.getCLSemaphore(),
-                                       nullptr, 2, wait_list, &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are not completed
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event_1 (expect failure as waiting on semaphore_2 is not
-    // allowed (unsignaled)
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_assert_error(err != CL_SUCCESS,
-                      "signal_2_event completed unexpectedly");
-
-    // Ensure signal_1 is completed while others failed (the second signal
-    // should fail as it depends on wait)
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_terminated(wait_event);
-    test_assert_event_terminated(signal_2_event);
-
-    return TEST_PASS;
-}
diff --git a/test_conformance/extensions/cl_khr_semaphore/main.cpp b/test_conformance/extensions/cl_khr_semaphore/main.cpp
index ab9699b..0ae7206 100644
--- a/test_conformance/extensions/cl_khr_semaphore/main.cpp
+++ b/test_conformance/extensions/cl_khr_semaphore/main.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -34,11 +34,7 @@
     ADD_TEST_VERSION(semaphores_multi_signal, Version(1, 2)),
     ADD_TEST_VERSION(semaphores_multi_wait, Version(1, 2)),
     ADD_TEST_VERSION(semaphores_queries, Version(1, 2)),
-    ADD_TEST_VERSION(semaphores_order_1, Version(1, 2)),
-    ADD_TEST_VERSION(semaphores_order_2, Version(1, 2)),
-    ADD_TEST_VERSION(semaphores_order_3, Version(1, 2)),
     ADD_TEST_VERSION(semaphores_import_export_fd, Version(1, 2)),
-    ADD_TEST_VERSION(semaphores_invalid_command, Version(1, 2)),
 };
 
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/extensions/cl_khr_semaphore/procs.h b/test_conformance/extensions/cl_khr_semaphore/procs.h
index 06651af..f7c1aaa 100644
--- a/test_conformance/extensions/cl_khr_semaphore/procs.h
+++ b/test_conformance/extensions/cl_khr_semaphore/procs.h
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -41,17 +41,7 @@
                                       cl_command_queue queue, int num_elements);
 extern int test_semaphores_queries(cl_device_id deviceID, cl_context context,
                                    cl_command_queue queue, int num_elements);
-extern int test_semaphores_order_1(cl_device_id deviceID, cl_context context,
-                                   cl_command_queue queue, int num_elements);
-extern int test_semaphores_order_2(cl_device_id deviceID, cl_context context,
-                                   cl_command_queue queue, int num_elements);
-extern int test_semaphores_order_3(cl_device_id deviceID, cl_context context,
-                                   cl_command_queue queue, int num_elements);
 extern int test_semaphores_import_export_fd(cl_device_id deviceID,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements);
-extern int test_semaphores_invalid_command(cl_device_id deviceID,
-                                           cl_context context,
-                                           cl_command_queue queue,
-                                           int num_elements);
diff --git a/test_conformance/extensions/cl_khr_semaphore/test_semaphores.cpp b/test_conformance/extensions/cl_khr_semaphore/test_semaphores.cpp
index 7d03bff..36bb8ad 100644
--- a/test_conformance/extensions/cl_khr_semaphore/test_semaphores.cpp
+++ b/test_conformance/extensions/cl_khr_semaphore/test_semaphores.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2022 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -646,303 +646,6 @@
     return TEST_PASS;
 }
 
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first one deferred wait, then one non deferred signal.
-int test_semaphores_order_1(cl_device_id deviceID, cl_context context,
-                            cl_command_queue defaultQueue, int num_elements)
-{
-    cl_int err;
-
-    if (!is_extension_available(deviceID, "cl_khr_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clCreateSemaphoreWithPropertiesKHR);
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-    GET_PFN(deviceID, clReleaseSemaphoreKHR);
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create semaphore
-    cl_semaphore_properties_khr sema_props[] = {
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
-        0
-    };
-    cl_semaphore_khr sema =
-        clCreateSemaphoreWithPropertiesKHR(context, sema_props, &err);
-    test_error(err, "Could not create semaphore");
-
-    // Create user event
-    clEventWrapper user_event = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Wait semaphore (dependency on user_event)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema, nullptr, 1, &user_event,
-                                     &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore
-    clEventWrapper signal_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema, nullptr, 0, nullptr,
-                                       &signal_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure signal event is completed while wait event is not
-    test_assert_event_complete(signal_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event
-    err = clSetUserEventStatus(user_event, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_event);
-    test_assert_event_complete(wait_event);
-
-    // Release semaphore
-    err = clReleaseSemaphoreKHR(sema);
-    test_error(err, "Could not release semaphore");
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then one deferred wait. Unblock
-// signal, then unblock wait. When wait completes, unblock the other signal.
-int test_semaphores_order_2(cl_device_id deviceID, cl_context context,
-                            cl_command_queue defaultQueue, int num_elements)
-{
-    cl_int err;
-
-    if (!is_extension_available(deviceID, "cl_khr_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clCreateSemaphoreWithPropertiesKHR);
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-    GET_PFN(deviceID, clReleaseSemaphoreKHR);
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create semaphore
-    cl_semaphore_properties_khr sema_props[] = {
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
-        0
-    };
-    cl_semaphore_khr sema =
-        clCreateSemaphoreWithPropertiesKHR(context, sema_props, &err);
-    test_error(err, "Could not create semaphore");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema, nullptr, 1,
-                                       &user_event_1, &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema, nullptr, 1,
-                                       &user_event_2, &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema, nullptr, 1, &user_event_3,
-                                     &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are completed except for second signal
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    // Release semaphore
-    err = clReleaseSemaphoreKHR(sema);
-    test_error(err, "Could not release semaphore");
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then two deferred waits. Unblock
-// one signal and one wait (both blocked by the same user event). When wait
-// completes, unblock the other signal. Then unblock the other wait.
-int test_semaphores_order_3(cl_device_id deviceID, cl_context context,
-                            cl_command_queue defaultQueue, int num_elements)
-{
-    cl_int err;
-
-    if (!is_extension_available(deviceID, "cl_khr_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clCreateSemaphoreWithPropertiesKHR);
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-    GET_PFN(deviceID, clReleaseSemaphoreKHR);
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create semaphore
-    cl_semaphore_properties_khr sema_props[] = {
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
-        0
-    };
-    cl_semaphore_khr sema =
-        clCreateSemaphoreWithPropertiesKHR(context, sema_props, &err);
-    test_error(err, "Could not create semaphore");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema, nullptr, 1,
-                                       &user_event_1, &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema, nullptr, 1,
-                                       &user_event_2, &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_1_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema, nullptr, 1, &user_event_3,
-                                     &wait_1_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Wait semaphore (dependency on user_event_2)
-    clEventWrapper wait_2_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema, nullptr, 1, &user_event_2,
-                                     &wait_2_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure only second signal and second wait completed
-    cl_event event_list[] = { signal_2_event, wait_2_event };
-    err = clWaitForEvents(2, event_list);
-    test_error(err, "Could not wait for events");
-
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(wait_1_event);
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_1_event);
-    test_assert_event_complete(wait_2_event);
-
-    // Release semaphore
-    err = clReleaseSemaphoreKHR(sema);
-    test_error(err, "Could not release semaphore");
-
-    return TEST_PASS;
-}
-
 // Test it is possible to export a semaphore to a sync fd and import the same
 // sync fd to a new semaphore
 int test_semaphores_import_export_fd(cl_device_id deviceID, cl_context context,
@@ -985,6 +688,8 @@
             CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR),
         static_cast<cl_semaphore_properties_khr>(
             CL_SEMAPHORE_HANDLE_SYNC_FD_KHR),
+        static_cast<cl_semaphore_properties_khr>(
+            CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR),
         0
     };
     cl_semaphore_khr sema_1 =
@@ -1040,106 +745,4 @@
     err = clReleaseSemaphoreKHR(sema_2);
     test_error(err, "Could not release semaphore");
     return TEST_PASS;
-}
-
-// Test that an invalid semaphore command results in the invalidation of the
-// command's event and the dependencies' events
-int test_semaphores_invalid_command(cl_device_id deviceID, cl_context context,
-                                    cl_command_queue defaultQueue,
-                                    int num_elements)
-{
-    cl_int err;
-
-    if (!is_extension_available(deviceID, "cl_khr_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clCreateSemaphoreWithPropertiesKHR);
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-    GET_PFN(deviceID, clReleaseSemaphoreKHR);
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create semaphores
-    cl_semaphore_properties_khr sema_props[] = {
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
-        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
-        0
-    };
-    cl_semaphore_khr sema_1 =
-        clCreateSemaphoreWithPropertiesKHR(context, sema_props, &err);
-    test_error(err, "Could not create semaphore");
-
-    cl_semaphore_khr sema_2 =
-        clCreateSemaphoreWithPropertiesKHR(context, sema_props, &err);
-    test_error(err, "Could not create semaphore");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore_1 (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_1, nullptr, 1,
-                                       &user_event_1, &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore_1 and semaphore_2 (dependency on user_event_1)
-    clEventWrapper wait_event;
-    cl_semaphore_khr sema_list[] = { sema_1, sema_2 };
-    err = clEnqueueWaitSemaphoresKHR(queue, 2, sema_list, nullptr, 1,
-                                     &user_event_1, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore_1 (dependency on wait_event and user_event_2)
-    clEventWrapper signal_2_event;
-    cl_event wait_list[] = { user_event_2, wait_event };
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_1, nullptr, 2, wait_list,
-                                       &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are not completed
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event_1 (expect failure as waiting on semaphore_2 is not
-    // allowed (unsignaled)
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_assert_error(err != CL_SUCCESS,
-                      "signal_2_event completed unexpectedly");
-
-    // Ensure signal_1 is completed while others failed (the second signal
-    // should fail as it depends on wait)
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_terminated(wait_event);
-    test_assert_event_terminated(signal_2_event);
-
-    // Release semaphore
-    err = clReleaseSemaphoreKHR(sema_1);
-    test_error(err, "Could not release semaphore");
-
-    err = clReleaseSemaphoreKHR(sema_2);
-    test_error(err, "Could not release semaphore");
-
-    return TEST_PASS;
 }
\ No newline at end of file
diff --git a/test_conformance/geometrics/CMakeLists.txt b/test_conformance/geometrics/CMakeLists.txt
index 3fee05f..8a6f25c 100644
--- a/test_conformance/geometrics/CMakeLists.txt
+++ b/test_conformance/geometrics/CMakeLists.txt
@@ -6,5 +6,7 @@
     test_geometrics.cpp
 )
 
+set_gnulike_module_compile_flags("-Wno-sign-compare")
+
 include(../CMakeCommon.txt)
 
diff --git a/test_conformance/gl/test_images_write_common.cpp b/test_conformance/gl/test_images_write_common.cpp
index 4d72129..69d00a1 100644
--- a/test_conformance/gl/test_images_write_common.cpp
+++ b/test_conformance/gl/test_images_write_common.cpp
@@ -571,6 +571,7 @@
                       "%s (%s):%d",
                       GetGLTargetName(target), __FUNCTION__, __FILE__,
                       __LINE__);
+            return -1;
     }
 
     // If there was a problem during creation, make sure it isn't a known
diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp
index 3e0b60d..888ca6e 100644
--- a/test_conformance/images/clCopyImage/test_copy_generic.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp
@@ -519,32 +519,53 @@
     if( gDebugTrace )
         log_info( " - Scanline verification...\n" );
 
-    size_t thirdDim;
-    size_t secondDim;
-    if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    size_t thirdDim = 1;
+    size_t secondDim = 1;
+
+    switch (dstImageInfo->type)
     {
-        secondDim = dstImageInfo->arraySize;
-        thirdDim = 1;
-    }
-    else if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
-    {
-        secondDim = dstImageInfo->height;
-        if( gTestMipmaps )
-            secondDim = (dstImageInfo->height >> dst_lod) ? (dstImageInfo->height >> dst_lod):1;
-        thirdDim = dstImageInfo->arraySize;
-    }
-    else
-    {
-        secondDim = dstImageInfo->height;
-        thirdDim = dstImageInfo->depth;
-        if( gTestMipmaps )
-        {
-            secondDim = (dstImageInfo->height >> dst_lod) ? (dstImageInfo->height >> dst_lod):1;
-            if(dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
-                thirdDim = (dstImageInfo->depth >> dst_lod) ? (dstImageInfo->depth >> dst_lod):1;
+        case CL_MEM_OBJECT_IMAGE1D_ARRAY: {
+            secondDim = dstImageInfo->arraySize;
+            break;
+        }
+        case CL_MEM_OBJECT_IMAGE2D_ARRAY: {
+            secondDim = dstImageInfo->height;
+            thirdDim = dstImageInfo->arraySize;
+            break;
+        }
+        case CL_MEM_OBJECT_IMAGE3D: {
+            secondDim = dstImageInfo->height;
+            thirdDim = dstImageInfo->depth;
+            break;
+        }
+        case CL_MEM_OBJECT_IMAGE2D: {
+            secondDim = dstImageInfo->height;
+            break;
+        }
+        case CL_MEM_OBJECT_IMAGE1D: {
+            break;
+        }
+        default: {
+            log_error("ERROR: Unsupported Image type. \n");
+            return error;
+            break;
         }
     }
-
+    if (gTestMipmaps)
+    {
+        switch (dstImageInfo->type)
+        {
+            case CL_MEM_OBJECT_IMAGE3D:
+                thirdDim = (dstImageInfo->depth >> dst_lod) ? (dstImageInfo->depth >> dst_lod):1;
+                /* Fallthrough */
+            case CL_MEM_OBJECT_IMAGE2D:
+            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+                secondDim = (dstImageInfo->height >> dst_lod)
+                    ? (dstImageInfo->height >> dst_lod)
+                    : 1;
+                break;
+        }
+    }
     for( size_t z = 0; z < thirdDim; z++ )
     {
         for( size_t y = 0; y < secondDim; y++ )
diff --git a/test_conformance/images/clCopyImage/test_loops.cpp b/test_conformance/images/clCopyImage/test_loops.cpp
index 6ee1e53..e839cfd 100644
--- a/test_conformance/images/clCopyImage/test_loops.cpp
+++ b/test_conformance/images/clCopyImage/test_loops.cpp
@@ -41,60 +41,52 @@
         }
     }
 
-    if( testMethod == k1D )
+    switch (testMethod)
     {
-        name = "1D -> 1D";
-        imageType = CL_MEM_OBJECT_IMAGE1D;
-    }
-    else if( testMethod == k2D )
-    {
-        name = "2D -> 2D";
-        imageType = CL_MEM_OBJECT_IMAGE2D;
-    }
-    else if( testMethod == k3D )
-    {
-        name = "3D -> 3D";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
-    }
-    else if( testMethod == k1DArray )
-    {
-        name = "1D array -> 1D array";
-        imageType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
-    }
-    else if( testMethod == k2DArray )
-    {
-        name = "2D array -> 2D array";
-        imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-    }
-    else if( testMethod == k2DTo3D )
-    {
-        name = "2D -> 3D";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
-    }
-    else if( testMethod == k3DTo2D )
-    {
-        name = "3D -> 2D";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
-    }
-    else if( testMethod == k2DArrayTo2D )
-    {
-        name = "2D array -> 2D";
-        imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-    }
-    else if( testMethod == k2DTo2DArray )
-    {
-        name = "2D -> 2D array";
-        imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-    }
-    else if( testMethod == k2DArrayTo3D )
-    {
-        name = "2D array -> 3D";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
-    }
-    else if( testMethod == k3DTo2DArray )
-    {
-        name = "3D -> 2D array";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
+        case k1D:
+            name = "1D -> 1D";
+            imageType = CL_MEM_OBJECT_IMAGE1D;
+            break;
+        case k2D:
+            name = "2D -> 2D";
+            imageType = CL_MEM_OBJECT_IMAGE2D;
+            break;
+        case k3D:
+            name = "3D -> 3D";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            break;
+        case k1DArray:
+            name = "1D array -> 1D array";
+            imageType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+            break;
+        case k2DArray:
+            name = "2D array -> 2D array";
+            imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+            break;
+        case k2DTo3D:
+            name = "2D -> 3D";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            break;
+        case k3DTo2D:
+            name = "3D -> 2D";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            break;
+        case k2DArrayTo2D:
+            name = "2D array -> 2D";
+            imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+            break;
+        case k2DTo2DArray:
+            name = "2D -> 2D array";
+            imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+            break;
+        case k2DArrayTo3D:
+            name = "2D array -> 3D";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            break;
+        case k3DTo2DArray:
+            name = "3D -> 2D array";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            break;
     }
 
     if(gTestMipmaps)
diff --git a/test_conformance/images/clFillImage/test_loops.cpp b/test_conformance/images/clFillImage/test_loops.cpp
index 759f48d..126ea0e 100644
--- a/test_conformance/images/clFillImage/test_loops.cpp
+++ b/test_conformance/images/clFillImage/test_loops.cpp
@@ -33,35 +33,34 @@
     cl_mem_object_type imageType;
     test_func test_fn;
 
-    if ( testMethod == k1D )
+    switch (testMethod)
     {
-        name = "1D Image Fill";
-        imageType = CL_MEM_OBJECT_IMAGE1D;
-        test_fn = &test_fill_image_set_1D;
-    }
-    else if ( testMethod == k2D )
-    {
-        name = "2D Image Fill";
-        imageType = CL_MEM_OBJECT_IMAGE2D;
-        test_fn = &test_fill_image_set_2D;
-    }
-    else if ( testMethod == k1DArray )
-    {
-        name = "1D Image Array Fill";
-        imageType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
-        test_fn = &test_fill_image_set_1D_array;
-    }
-    else if ( testMethod == k2DArray )
-    {
-        name = "2D Image Array Fill";
-        imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-        test_fn = &test_fill_image_set_2D_array;
-    }
-    else if ( testMethod == k3D )
-    {
-        name = "3D Image Fill";
-        imageType = CL_MEM_OBJECT_IMAGE3D;
-        test_fn = &test_fill_image_set_3D;
+        case k1D:
+            name = "1D Image Fill";
+            imageType = CL_MEM_OBJECT_IMAGE1D;
+            test_fn = &test_fill_image_set_1D;
+            break;
+        case k2D:
+            name = "2D Image Fill";
+            imageType = CL_MEM_OBJECT_IMAGE2D;
+            test_fn = &test_fill_image_set_2D;
+            break;
+        case k1DArray:
+            name = "1D Image Array Fill";
+            imageType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+            test_fn = &test_fill_image_set_1D_array;
+            break;
+        case k2DArray:
+            name = "2D Image Array Fill";
+            imageType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+            test_fn = &test_fill_image_set_2D_array;
+            break;
+        case k3D:
+            name = "3D Image Fill";
+            imageType = CL_MEM_OBJECT_IMAGE3D;
+            test_fn = &test_fill_image_set_3D;
+            break;
+        default: log_error("Unhandled method\n"); return -1;
     }
 
     log_info( "Running %s tests...\n", name );
diff --git a/test_conformance/images/kernel_read_write/CMakeLists.txt b/test_conformance/images/kernel_read_write/CMakeLists.txt
index b5527c7..d7e7ede 100644
--- a/test_conformance/images/kernel_read_write/CMakeLists.txt
+++ b/test_conformance/images/kernel_read_write/CMakeLists.txt
@@ -21,7 +21,7 @@
 
 # Make unused variables not fatal in this module; see
 # https://github.com/KhronosGroup/OpenCL-CTS/issues/1484
-set_gnulike_module_compile_flags("-Wno-error=unused-variable -Wno-unused-but-set-variable")
+set_gnulike_module_compile_flags("-Wno-error=unused-variable -Wno-unused-but-set-variable -Wno-sign-compare")
 
 include(../../CMakeCommon.txt)
 
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
index c664633..887c9dc 100644
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
@@ -14,8 +14,8 @@
 // limitations under the License.
 //
 
-#ifndef _TEST_CL_EXT_IMAGE_BUFFER
-#define _TEST_CL_EXT_IMAGE_BUFFER
+#ifndef TEST_CL_EXT_IMAGE_BUFFER
+#define TEST_CL_EXT_IMAGE_BUFFER
 
 #define TEST_IMAGE_SIZE 20
 
@@ -48,8 +48,10 @@
     cl_image_desc image_desc = { 0 };
     image_desc.image_type = imageType;
 
-    /* Size 1 only to query element size */
-    image_desc.image_width = 1;
+    /* We use a width of 4 to query element size, as this is
+       the smallest possible value that satisfies the requirements
+       of all image formats (including extensions). */
+    image_desc.image_width = 4;
     if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
         && CL_MEM_OBJECT_IMAGE1D != imageType)
     {
@@ -121,4 +123,4 @@
     }
 }
 
-#endif /* _TEST_CL_EXT_IMAGE_BUFFER */
\ No newline at end of file
+#endif // TEST_CL_EXT_IMAGE_BUFFER
diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 3281402..a221f05 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -45,4 +45,6 @@
 # warnings), but other tests not (yet); so enable -Wall locally.
 set_gnulike_module_compile_flags("-Wall -Wno-strict-aliasing -Wno-unknown-pragmas")
 
+add_cxx_flag_if_supported(-ffp-contract=off)
+
 include(../CMakeCommon.txt)
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 3d6ce15..953c33b 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -50,11 +50,6 @@
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
-    // This test is not using ThreadPool so we need to disable FTZ here
-    // for reference computations
-    FPU_mode_type oldMode;
-    DisableFTZ(&oldMode);
-
     Force64BitFPUPrecision();
 
     // Init the kernels
@@ -227,6 +222,5 @@
     vlog("\n");
 
 exit:
-    RestoreFPState(&oldMode);
     return error;
 }
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index 94ebc66..0ce37cc 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -49,11 +49,6 @@
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    // This test is not using ThreadPool so we need to disable FTZ here
-    // for reference computations
-    FPU_mode_type oldMode;
-    DisableFTZ(&oldMode);
-
     Force64BitFPUPrecision();
 
     // Init the kernels
@@ -225,6 +220,5 @@
     vlog("\n");
 
 exit:
-    RestoreFPState(&oldMode);
     return error;
 }
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 5367978..34f49a5 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -81,7 +81,6 @@
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     cl_int error = CL_SUCCESS;
-    cl_int ret = CL_SUCCESS;
     const char *name = job->f->name;
 
     int signbit_test = 0;
@@ -245,8 +244,7 @@
                 if (q[j] > t[j]) err = q[j] - t[j];
                 vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
                            name, err, ((float *)s)[j], t[j], q[j]);
-                error = -1;
-                goto exit;
+                return -1;
             }
 
 
@@ -272,15 +270,12 @@
                     vlog_error(
                         "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
                         sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
-                    error = -1;
-                    goto exit;
+                    return -1;
                 }
             }
         }
     }
 
-exit:
-    ret = error;
     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
@@ -315,7 +310,7 @@
         fflush(stdout);
     }
 
-    return ret;
+    return CL_SUCCESS;
 }
 
 } // anonymous namespace
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 74c5a16..8d42340 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -189,12 +189,11 @@
         // Get that moving
         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        FPU_mode_type oldMode;
+        FPU_mode_type oldMode = 0;
         RoundingMode oldRoundMode = kRoundToNearestEven;
         if (isFract)
         {
             // Calculate the correctly rounded reference result
-            memset(&oldMode, 0, sizeof(oldMode));
             if (ftz || relaxedMode) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
diff --git a/test_conformance/mem_host_flags/C_host_memory_block.h b/test_conformance/mem_host_flags/C_host_memory_block.h
index 78692d1..0784c2c 100644
--- a/test_conformance/mem_host_flags/C_host_memory_block.h
+++ b/test_conformance/mem_host_flags/C_host_memory_block.h
@@ -24,14 +24,14 @@
 
 template <class T> class C_host_memory_block {
 public:
-    int num_elements;
+    size_t num_elements;
     int element_size;
     T *pData;
 
     C_host_memory_block();
     ~C_host_memory_block();
-    void Init(int num_elem, T &value);
-    void Init(int num_elem);
+    void Init(size_t num_elem, T &value);
+    void Init(size_t num_elem);
     void Set_to(T &val);
     void Set_to_zero();
     bool Equal_to(T &val);
@@ -40,7 +40,7 @@
     bool Equal_rect(C_host_memory_block<T> &another, size_t *host_origin,
                     size_t *region, size_t host_row_pitch,
                     size_t host_slice_pitch);
-    bool Equal(T *pData, int num_elements);
+    bool Equal(T *pData, size_t num_elements);
 
     bool Equal_rect_from_orig(C_host_memory_block<T> &another, size_t *soffset,
                               size_t *region, size_t host_row_pitch,
@@ -63,20 +63,20 @@
     num_elements = 0;
 }
 
-template <class T> void C_host_memory_block<T>::Init(int num_elem, T &value)
+template <class T> void C_host_memory_block<T>::Init(size_t num_elem, T &value)
 {
     if (pData != NULL) delete[] pData;
     pData = new T[num_elem];
-    for (int i = 0; i < num_elem; i++) pData[i] = value;
+    for (size_t i = 0; i < num_elem; i++) pData[i] = value;
 
     num_elements = num_elem;
 }
 
-template <class T> void C_host_memory_block<T>::Init(int num_elem)
+template <class T> void C_host_memory_block<T>::Init(size_t num_elem)
 {
     if (pData != NULL) delete[] pData;
     pData = new T[num_elem];
-    for (int i = 0; i < num_elem; i++) pData[i] = (T)i;
+    for (size_t i = 0; i < num_elem; i++) pData[i] = (T)i;
 
     num_elements = num_elem;
 }
@@ -88,14 +88,14 @@
 
 template <class T> void C_host_memory_block<T>::Set_to(T &val)
 {
-    for (int i = 0; i < num_elements; i++) pData[i] = val;
+    for (size_t i = 0; i < num_elements; i++) pData[i] = val;
 }
 
 template <class T> bool C_host_memory_block<T>::Equal_to(T &val)
 {
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == val) count++;
     }
@@ -106,9 +106,9 @@
 template <class T>
 bool C_host_memory_block<T>::Equal(C_host_memory_block<T> &another)
 {
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == another.pData[i]) count++;
     }
@@ -117,13 +117,13 @@
 }
 
 template <class T>
-bool C_host_memory_block<T>::Equal(T *pIn_Data, int Innum_elements)
+bool C_host_memory_block<T>::Equal(T *pIn_Data, size_t Innum_elements)
 {
     if (this->num_elements != Innum_elements) return false;
 
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == pIn_Data[i]) count++;
     }
@@ -134,7 +134,7 @@
 template <class T> size_t C_host_memory_block<T>::Count(T &val)
 {
     size_t count = 0;
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == val) count++;
     }
diff --git a/test_conformance/mem_host_flags/checker.h b/test_conformance/mem_host_flags/checker.h
index 835f120..0bb826f 100644
--- a/test_conformance/mem_host_flags/checker.h
+++ b/test_conformance/mem_host_flags/checker.h
@@ -219,7 +219,7 @@
         err = CL_SUCCESS;
     }
 
-    cl_mem_flags f;
+    cl_mem_flags f = 0;
     if (parent_buffer_flag & CL_MEM_HOST_READ_ONLY)
         f = CL_MEM_HOST_READ_ONLY;
     else if (parent_buffer_flag & CL_MEM_HOST_WRITE_ONLY)
diff --git a/test_conformance/non_uniform_work_group/CMakeLists.txt b/test_conformance/non_uniform_work_group/CMakeLists.txt
index f78dd19..30c3a84 100644
--- a/test_conformance/non_uniform_work_group/CMakeLists.txt
+++ b/test_conformance/non_uniform_work_group/CMakeLists.txt
@@ -10,8 +10,6 @@
     tools.cpp
 )
 
-set_gnulike_module_compile_flags("-Wno-unused-but-set-variable")
-
 include(../CMakeCommon.txt)
 
 # end of file #
diff --git a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
index a4a6a74..44781ca 100644
--- a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
+++ b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
@@ -448,13 +448,8 @@
 }
 
 void TestNonUniformWorkGroup::calculateExpectedValues () {
-  size_t nonRemainderGlobalSize[MAX_DIMS];
   size_t numberOfPossibleRegions[MAX_DIMS];
 
-  nonRemainderGlobalSize[0] = _globalSize[0] - (_globalSize[0] % _enqueuedLocalSize[0]);
-  nonRemainderGlobalSize[1] = _globalSize[1] - (_globalSize[1] % _enqueuedLocalSize[1]);
-  nonRemainderGlobalSize[2] = _globalSize[2] - (_globalSize[2] % _enqueuedLocalSize[2]);
-
   numberOfPossibleRegions[0] = (_globalSize[0]>1)?2:1;
   numberOfPossibleRegions[1] = (_globalSize[1]>1)?2:1;
   numberOfPossibleRegions[2] = (_globalSize[2]>1)?2:1;
@@ -502,6 +497,11 @@
   if (TestNonUniformWorkGroup::_maxLocalWorkgroupSize == 0) {
     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
       sizeof(TestNonUniformWorkGroup::_maxLocalWorkgroupSize), &TestNonUniformWorkGroup::_maxLocalWorkgroupSize, NULL);
+    if (err)
+    {
+        log_error("clGetDeviceInfo failed\n");
+        return 0;
+    }
   }
 
   return TestNonUniformWorkGroup::_maxLocalWorkgroupSize;
diff --git a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
index 414d100..f584606 100644
--- a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
+++ b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _TESTNONUNIFORMWORKGROUP_H
-#define _TESTNONUNIFORMWORKGROUP_H
+#ifndef TESTNONUNIFORMWORKGROUP_H
+#define TESTNONUNIFORMWORKGROUP_H
 
 #include "procs.h"
 #include <vector>
@@ -147,5 +147,4 @@
   unsigned int _overallCounter;
 };
 
-#endif // _TESTNONUNIFORMWORKGROUP_H
-
+#endif // TESTNONUNIFORMWORKGROUP_H
diff --git a/test_conformance/non_uniform_work_group/tools.h b/test_conformance/non_uniform_work_group/tools.h
index 2e63c3d..ba01fc9 100644
--- a/test_conformance/non_uniform_work_group/tools.h
+++ b/test_conformance/non_uniform_work_group/tools.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _TOOLS_H
-#define _TOOLS_H
+#ifndef TOOLS_H
+#define TOOLS_H
 
 #include "procs.h"
 #include <vector>
@@ -106,4 +106,4 @@
   };
 
 }
-#endif // _TOOLS_H
+#endif // TOOLS_H
diff --git a/test_conformance/pipes/kernels.h b/test_conformance/pipes/kernels.h
index a2fb70c..a897e5e 100644
--- a/test_conformance/pipes/kernels.h
+++ b/test_conformance/pipes/kernels.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _KERNELS_H_
-#define _KERNELS_H_
+#ifndef KERNELS_H_
+#define KERNELS_H_
 
 static const char* pipe_readwrite_struct_kernel_code = {
     "typedef struct{\n"
@@ -127,4 +127,4 @@
     "    read_pipe(in_pipe, &dst[gid]);\n"
     "}\n" };
 
-#endif //_KERNELS_H_
+#endif // KERNELS_H_
diff --git a/test_conformance/pipes/test_pipe_limits.cpp b/test_conformance/pipes/test_pipe_limits.cpp
index e1048f5..76b80b1 100644
--- a/test_conformance/pipes/test_pipe_limits.cpp
+++ b/test_conformance/pipes/test_pipe_limits.cpp
@@ -274,8 +274,7 @@
     size_t global_work_size[3];
     cl_int err;
     size_t size;
-    int num_pipe_elements = 1024;
-    int i;
+    cl_uint num_pipe_elements = 1024;
     cl_uint max_pipe_packet_size;
     clEventWrapper producer_sync_event = NULL;
     clEventWrapper consumer_sync_event = NULL;
@@ -287,7 +286,7 @@
 
     size_t min_alignment = get_min_alignment(context);
 
-    global_work_size[0] = (cl_uint)num_pipe_elements;
+    global_work_size[0] = num_pipe_elements;
 
     std::stringstream source;
 
@@ -312,7 +311,8 @@
 
     inptr = (cl_char *)align_malloc(size, min_alignment);
 
-    for(i = 0; i < size; i++){
+    for (size_t i = 0; i < size; i++)
+    {
         inptr[i] = (char)genrand_int32(d);
     }
     BufferInPtr.reset(inptr, nullptr, 0, size, true);
@@ -412,7 +412,7 @@
     clMemWrapper buf_reserve_id_t_size_aligned;
     cl_int *inptr;
     void *outptr;
-    int size, i;
+    int size;
     clProgramWrapper program;
     clKernelWrapper kernel[3];
     size_t global_work_size[3];
@@ -565,7 +565,8 @@
     size = sizeof(cl_int) * max_active_reservations;
     inptr = (cl_int *)align_malloc(size, min_alignment);
 
-    for(i = 0; i < max_active_reservations; i++){
+    for (cl_uint i = 0; i < max_active_reservations; i++)
+    {
         inptr[i] = (int)genrand_int32(d);
     }
     BufferInPtr.reset(inptr, nullptr, 0, size, true);
diff --git a/test_conformance/relationals/test_comparisons_fp.cpp b/test_conformance/relationals/test_comparisons_fp.cpp
index 580b742..73ff3dd 100644
--- a/test_conformance/relationals/test_comparisons_fp.cpp
+++ b/test_conformance/relationals/test_comparisons_fp.cpp
@@ -14,12 +14,16 @@
 // limitations under the License.
 //
 
+#include <cstdint>
+#include <functional>
 #include <iostream>
 #include <map>
 #include <memory>
 #include <stdexcept>
 #include <vector>
 
+#include "harness/stringHelpers.h"
+
 #include <CL/cl_half.h>
 
 #include "test_comparisons_fp.h"
@@ -81,29 +85,6 @@
 // clang-format on
 
 
-std::string concat_kernel(const char* sstr[], int num)
-{
-    std::string res;
-    for (int i = 0; i < num; i++) res += std::string(sstr[i]);
-    return res;
-}
-
-template <typename... Args>
-std::string string_format(const std::string& format, Args... args)
-{
-    int size_s = std::snprintf(nullptr, 0, format.c_str(), args...)
-        + 1; // Extra space for '\0'
-    if (size_s <= 0)
-    {
-        throw std::runtime_error("Error during formatting.");
-    }
-    auto size = static_cast<size_t>(size_s);
-    std::unique_ptr<char[]> buf(new char[size]);
-    std::snprintf(buf.get(), size, format.c_str(), args...);
-    return std::string(buf.get(),
-                       buf.get() + size - 1); // We don't want the '\0' inside
-}
-
 template <typename T, typename F> bool verify(const T& A, const T& B)
 {
     return F()(A, B);
@@ -224,14 +205,14 @@
             auto str =
                 concat_kernel(equivTestKerPat_3,
                               sizeof(equivTestKerPat_3) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKerPatLessGreater_3,
                                      sizeof(equivTestKerPatLessGreater_3)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
     else
@@ -241,14 +222,14 @@
             auto str =
                 concat_kernel(equivTestKernPat,
                               sizeof(equivTestKernPat) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKernPatLessGreater,
                                      sizeof(equivTestKernPatLessGreater)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
 
diff --git a/test_conformance/relationals/test_comparisons_fp.h b/test_conformance/relationals/test_comparisons_fp.h
index 7faca1c..3401163 100644
--- a/test_conformance/relationals/test_comparisons_fp.h
+++ b/test_conformance/relationals/test_comparisons_fp.h
@@ -14,8 +14,8 @@
 // limitations under the License.
 //
 
-#ifndef _TEST_COMPARISONS_FP_H
-#define _TEST_COMPARISONS_FP_H
+#ifndef TEST_COMPARISONS_FP_H
+#define TEST_COMPARISONS_FP_H
 
 #include <map>
 #include <memory>
@@ -32,6 +32,7 @@
 struct RelTestBase
 {
     explicit RelTestBase(const ExplicitTypes &dt): dataType(dt) {}
+    virtual ~RelTestBase() = default;
     ExplicitTypes dataType;
 };
 
@@ -224,4 +225,4 @@
     return TEST_PASS;
 }
 
-#endif // _TEST_COMPARISONS_FP_H
+#endif // TEST_COMPARISONS_FP_H
diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp
index 7fa3bc0..72be08c 100644
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -14,11 +14,16 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/typeWrappers.h"
 
 #include <assert.h>
 #include <stdio.h>
 #include <time.h>
 #include <string.h>
+
+#include <cinttypes>
+#include <vector>
+
 #if ! defined( _WIN32)
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
@@ -42,11 +47,14 @@
 
 // initialize the valued used to compare with in the select with
 // vlaues [start, count)
-static void initCmpBuffer(void* cmp, Type cmptype, uint64_t start, size_t count);
+static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
+                          const size_t count);
 
 // make a program that uses select for the given stype (src/dest type),
 // ctype (comparison type), veclen (vector length)
-static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type stype, Type ctype, size_t veclen );
+static cl_program makeSelectProgram(cl_kernel *kernel_ptr, cl_context context,
+                                    Type stype, Type ctype,
+                                    const size_t veclen);
 
 // Creates and execute the select test for the given device, context,
 // stype (source/dest type), cmptype (comparison type), using max_tg_size
@@ -66,6 +74,16 @@
 #define BUFFER_SIZE (1024*1024)
 #define KPAGESIZE 4096
 
+#define test_error_count(errCode, msg)                                         \
+    {                                                                          \
+        auto errCodeResult = errCode;                                          \
+        if (errCodeResult != CL_SUCCESS)                                       \
+        {                                                                      \
+            gFailCount++;                                                      \
+            print_error(errCodeResult, msg);                                   \
+            return errCode;                                                    \
+        }                                                                      \
+    }
 
 // When we indicate non wimpy mode, the types that are 32 bits value will
 // test their entire range and 64 bits test will test the 32 bit
@@ -74,12 +92,6 @@
 static bool  s_wimpy_mode = false;
 static int s_wimpy_reduction_factor = 256;
 
-// Tests are broken into the major test which is based on the
-// src and cmp type and their corresponding vector types and
-// sub tests which is for each individual test.  The following
-// tracks the subtests
-int s_test_cnt = 0;
-
 //-----------------------------------------
 // Static helper functions
 //-----------------------------------------
@@ -112,36 +124,37 @@
         s1[i]   = genrand_int32(d);
 }
 
-static void initCmpBuffer(void* cmp, Type cmptype, uint64_t start, size_t count) {
-    int i;
+static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
+                          const size_t count)
+
+{
     assert(cmptype != kfloat);
     switch (type_size[cmptype]) {
         case 1: {
             uint8_t* ub = (uint8_t *)cmp;
-            for (i=0; i < count; ++i)
-                ub[i] = (uint8_t)start++;
+            for (size_t i = 0; i < count; ++i) ub[i] = (uint8_t)start++;
             break;
         }
         case 2: {
             uint16_t* us = (uint16_t *)cmp;
-            for (i=0; i < count; ++i)
-                us[i] = (uint16_t)start++;
+            for (size_t i = 0; i < count; ++i) us[i] = (uint16_t)start++;
             break;
         }
         case 4: {
             if (!s_wimpy_mode) {
                 uint32_t* ui = (uint32_t *)cmp;
-                for (i=0; i < count; ++i)
-                    ui[i] = (uint32_t)start++;
+                for (size_t i = 0; i < count; ++i) ui[i] = (uint32_t)start++;
             }
             else {
                 // The short test doesn't iterate over the entire 32 bit space so
                 // we alternate between positive and negative values
                 int32_t* ui = (int32_t *)cmp;
-                int32_t sign = 1;
-                for (i=0; i < count; ++i, ++start) {
-                    ui[i] = (int32_t)start*sign;
-                    sign = sign * -1;
+                int32_t neg_start = (int32_t)start * -1;
+                for (size_t i = 0; i < count; i++)
+                {
+                    ++start;
+                    --neg_start;
+                    ui[i] = (int32_t)((i % 2) ? start : neg_start);
                 }
             }
             break;
@@ -150,10 +163,12 @@
             // We don't iterate over the entire space of 64 bit so for the
             // selects, we want to test positive and negative values
             int64_t* ll = (int64_t *)cmp;
-            int64_t sign = 1;
-            for (i=0; i < count; ++i, ++start) {
-                ll[i] = start*sign;
-                sign = sign * -1;
+            int64_t neg_start = (int64_t)start * -1;
+            for (size_t i = 0; i < count; i++)
+            {
+                ++start;
+                --neg_start;
+                ll[i] = (int64_t)((i % 2) ? start : neg_start);
             }
             break;
         }
@@ -165,7 +180,9 @@
 // Make the various incarnations of the program we want to run
 //  stype: source and destination type for the select
 //  ctype: compare type
-static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type srctype, Type cmptype, size_t vec_len)
+static cl_program makeSelectProgram(cl_kernel *kernel_ptr,
+                                    const cl_context context, Type srctype,
+                                    Type cmptype, const size_t vec_len)
 {
     char testname[256];
     char stypename[32];
@@ -237,6 +254,9 @@
     if (srctype == kdouble)
         strcpy( extension, "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" );
 
+    if (srctype == khalf)
+        strcpy(extension, "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     // create type name and testname
     switch( vec_len )
     {
@@ -288,39 +308,38 @@
     return program;
 }
 
-
 #define VECTOR_SIZE_COUNT   6
 
 static int doTest(cl_command_queue queue, cl_context context, Type stype, Type cmptype, cl_device_id device)
 {
     int err = CL_SUCCESS;
-    int s_test_fail = 0;
-    MTdataHolder d;
+    MTdataHolder d(gRandomSeed);
     const size_t element_count[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
-    cl_mem src1 = NULL;
-    cl_mem src2 = NULL;
-    cl_mem cmp = NULL;
-    cl_mem dest = NULL;
-    void *ref = NULL;
-    void *sref = NULL;
+    clMemWrapper src1, src2, cmp, dest;
 
     cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
-    size_t block_elements = BUFFER_SIZE / type_size[stype];
+    const size_t block_elements = BUFFER_SIZE / type_size[stype];
     size_t step = s_wimpy_mode ? s_wimpy_reduction_factor : 1;
     cl_ulong cmp_stride = block_elements * step;
 
     // It is more efficient to create the tests all at once since we
     // use the same test data on each of the vector sizes
-    int vecsize;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel  kernels[VECTOR_SIZE_COUNT];
+    clProgramWrapper programs[VECTOR_SIZE_COUNT];
+    clKernelWrapper kernels[VECTOR_SIZE_COUNT];
 
-    if(stype == kdouble && ! is_extension_available( device, "cl_khr_fp64" ))
+    if (stype == kdouble && !is_extension_available(device, "cl_khr_fp64"))
     {
         log_info("Skipping double because cl_khr_fp64 extension is not supported.\n");
         return 0;
     }
 
+    if (stype == khalf && !is_extension_available(device, "cl_khr_fp16"))
+    {
+        log_info(
+            "Skipping half because cl_khr_fp16 extension is not supported.\n");
+        return 0;
+    }
+
     if (gIsEmbedded)
     {
        if (( stype == klong || stype == kulong ) && ! is_extension_available( device, "cles_khr_int64" ))
@@ -336,29 +355,51 @@
        }
     }
 
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
+    src1 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
+    test_error_count(err, "Error: could not allocate src1 buffer\n");
+    src2 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
+    test_error_count(err, "Error: could not allocate src2 buffer\n");
+    cmp = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
+    test_error_count(err, "Error: could not allocate cmp buffer\n");
+    dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
+    test_error_count(err, "Error: could not allocate dest buffer\n");
+
+    programs[0] = makeSelectProgram(&kernels[0], context, stype, cmptype,
+                                    element_count[0]);
+    programs[1] = makeSelectProgram(&kernels[1], context, stype, cmptype,
+                                    element_count[1]);
+    programs[2] = makeSelectProgram(&kernels[2], context, stype, cmptype,
+                                    element_count[2]);
+    programs[3] = makeSelectProgram(&kernels[3], context, stype, cmptype,
+                                    element_count[3]);
+    programs[4] = makeSelectProgram(&kernels[4], context, stype, cmptype,
+                                    element_count[4]);
+    programs[5] = makeSelectProgram(&kernels[5], context, stype, cmptype,
+                                    element_count[5]);
+
+    for (size_t vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
     {
-        programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype, cmptype, element_count[vecsize] );
-        if (!programs[vecsize] || !kernels[vecsize]) {
-            ++s_test_fail;
-            ++s_test_cnt;
+        if (!programs[vecsize] || !kernels[vecsize])
+        {
             return -1;
         }
+
+        err = clSetKernelArg(kernels[vecsize], 0, sizeof dest, &dest);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 1, sizeof src1, &src1);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 2, sizeof src2, &src2);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 3, sizeof cmp, &cmp);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
     }
 
-    ref = malloc( BUFFER_SIZE );
-    if( NULL == ref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
-    sref = malloc( BUFFER_SIZE );
-    if( NULL == sref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
-    src1 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src1 buffer\n" );  ++s_test_fail; goto exit; }
-    src2 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src2 buffer\n" );  ++s_test_fail; goto exit; }
-    cmp = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate cmp buffer\n" );  ++s_test_fail; goto exit; }
-    dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate dest buffer\n" );  ++s_test_fail; goto exit; }
-
+    std::vector<char> ref(BUFFER_SIZE);
+    std::vector<char> sref(BUFFER_SIZE);
+    std::vector<char> src1_host(BUFFER_SIZE);
+    std::vector<char> src2_host(BUFFER_SIZE);
+    std::vector<char> cmp_host(BUFFER_SIZE);
+    std::vector<char> dest_host(BUFFER_SIZE);
 
     // We block the test as we are running over the range of compare values
     // "block the test" means "break the test into blocks"
@@ -368,81 +409,63 @@
         cmp_stride = block_elements * step * (0xffffffffffffffffULL / 0x100000000ULL + 1);
 
     log_info("Testing...");
-    d = MTdataHolder(gRandomSeed);
     uint64_t i;
+
+    initSrcBuffer(src1_host.data(), stype, d);
+    initSrcBuffer(src2_host.data(), stype, d);
     for (i=0; i < blocks; i+=step)
     {
-        void *s1 = clEnqueueMapBuffer( queue, src1, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src1" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s1, stype, d);
+        initCmpBuffer(cmp_host.data(), cmptype, i * cmp_stride, block_elements);
 
-        void *s2 = clEnqueueMapBuffer( queue, src2, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src2" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s2, stype, d);
+        err = clEnqueueWriteBuffer(queue, src1, CL_FALSE, 0, BUFFER_SIZE,
+                                   src1_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src1");
 
-        void *s3 = clEnqueueMapBuffer( queue, cmp, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map cmp" ); goto exit; }
-        // Setup the input data to change for each block
-        initCmpBuffer(s3, cmptype, i * cmp_stride, block_elements);
+        err = clEnqueueWriteBuffer(queue, src2, CL_FALSE, 0, BUFFER_SIZE,
+                                   src2_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src2");
 
-        // Create the reference result
-        Select sfunc = (cmptype == ctype[stype][0]) ? vrefSelects[stype][0] : vrefSelects[stype][1];
-        (*sfunc)(ref, s1, s2, s3, block_elements);
+        err = clEnqueueWriteBuffer(queue, cmp, CL_FALSE, 0, BUFFER_SIZE,
+                                   cmp_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write cmp");
 
-        sfunc = (cmptype == ctype[stype][0]) ? refSelects[stype][0] : refSelects[stype][1];
-        (*sfunc)(sref, s1, s2, s3, block_elements);
+        Select sfunc = (cmptype == ctype[stype][0]) ? vrefSelects[stype][0]
+                                                    : vrefSelects[stype][1];
+        (*sfunc)(ref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
-        if( (err = clEnqueueUnmapMemObject( queue, src1, s1, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src1\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, src2, s2, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src2\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, cmp, s3, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap cmp\n" );  ++s_test_fail; goto exit; }
+        sfunc = (cmptype == ctype[stype][0]) ? refSelects[stype][0]
+                                             : refSelects[stype][1];
+        (*sfunc)(sref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
-        for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
+        for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
         {
             size_t vector_size = element_count[vecsize] * type_size[stype];
             size_t vector_count =  (BUFFER_SIZE + vector_size - 1) / vector_size;
 
-            if((err = clSetKernelArg(kernels[vecsize], 0,  sizeof dest, &dest) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 1,  sizeof src1, &src1) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 2,  sizeof src2, &src2) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 3,  sizeof cmp, &cmp) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
+            const cl_int pattern = -1;
+            err = clEnqueueFillBuffer(queue, dest, &pattern, sizeof(cl_int), 0,
+                                      BUFFER_SIZE, 0, nullptr, nullptr);
+            test_error_count(err, "clEnqueueFillBuffer failed");
 
 
-            // Wipe destination
-            void *d = clEnqueueMapBuffer( queue, dest, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-            if( err ){ log_error( "Error: Could not map dest" );  ++s_test_fail; goto exit; }
-            memset( d, -1, BUFFER_SIZE );
-            if( (err = clEnqueueUnmapMemObject( queue, dest, d, 0, NULL, NULL ) ) ){ log_error( "Error: Could not unmap dest" ); ++s_test_fail; goto exit; }
-
             err = clEnqueueNDRangeKernel(queue, kernels[vecsize], 1, NULL, &vector_count, NULL, 0, NULL, NULL);
-            if (err != CL_SUCCESS) {
-                log_error("clEnqueueNDRangeKernel failed errcode:%d\n", err);
-                ++s_test_fail;
-                goto exit;
-            }
+            test_error_count(err, "clEnqueueNDRangeKernel failed errcode\n");
 
-            d = clEnqueueMapBuffer( queue, dest, CL_TRUE, CL_MAP_READ, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-            if( err ){ log_error( "Error: Could not map dest # 2" );  ++s_test_fail; goto exit; }
+            err = clEnqueueReadBuffer(queue, dest, CL_TRUE, 0, BUFFER_SIZE,
+                                      dest_host.data(), 0, NULL, NULL);
+            test_error_count(
+                err, "Error: Reading buffer from dest to dest_host failed\n");
 
-            if ((*checkResults[stype])(d, vecsize == 0 ? sref : ref, block_elements, element_count[vecsize])!=0){
-                log_error("vec_size:%d indx: 0x%16.16llx\n", (int)element_count[vecsize], i);
-                ++s_test_fail;
-                goto exit;
-            }
-
-            if( (err = clEnqueueUnmapMemObject( queue, dest, d, 0, NULL, NULL ) ) )
+            if ((*checkResults[stype])(dest_host.data(),
+                                       vecsize == 0 ? sref.data() : ref.data(),
+                                       block_elements, element_count[vecsize])
+                != 0)
             {
-                log_error( "Error: Could not unmap dest" );
-                ++s_test_fail;
-                goto exit;
+                log_error("vec_size:%d indx: 0x%16.16" PRIx64 "\n",
+                          (int)element_count[vecsize], i);
+                return TEST_FAIL;
             }
         } // for vecsize
     } // for i
@@ -452,24 +475,6 @@
     else
         log_info(" Wimpy Passed\n\n");
 
-exit:
-    if( src1 )  clReleaseMemObject( src1 );
-    if( src2 )  clReleaseMemObject( src2 );
-    if( cmp )   clReleaseMemObject( cmp );
-    if( dest)   clReleaseMemObject( dest );
-    if( ref )   free(ref );
-    if( sref )  free(sref );
-
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; vecsize++) {
-        clReleaseKernel(kernels[vecsize]);
-        clReleaseProgram(programs[vecsize]);
-    }
-    ++s_test_cnt;
-    if (s_test_fail)
-    {
-        err = TEST_FAIL;
-        gFailCount++;
-    }
     return err;
 }
 
@@ -505,6 +510,16 @@
 {
     return doTest(queue, context, kshort, kshort, deviceID);
 }
+int test_select_half_ushort(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kushort, deviceID);
+}
+int test_select_half_short(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kshort, deviceID);
+}
 int test_select_uint_uint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return doTest(queue, context, kuint, kuint, deviceID);
@@ -555,26 +570,17 @@
 }
 
 test_definition test_list[] = {
-    ADD_TEST( select_uchar_uchar ),
-    ADD_TEST( select_uchar_char ),
-    ADD_TEST( select_char_uchar ),
-    ADD_TEST( select_char_char ),
-    ADD_TEST( select_ushort_ushort ),
-    ADD_TEST( select_ushort_short ),
-    ADD_TEST( select_short_ushort ),
-    ADD_TEST( select_short_short ),
-    ADD_TEST( select_uint_uint ),
-    ADD_TEST( select_uint_int ),
-    ADD_TEST( select_int_uint ),
-    ADD_TEST( select_int_int ),
-    ADD_TEST( select_float_uint ),
-    ADD_TEST( select_float_int ),
-    ADD_TEST( select_ulong_ulong ),
-    ADD_TEST( select_ulong_long ),
-    ADD_TEST( select_long_ulong ),
-    ADD_TEST( select_long_long ),
-    ADD_TEST( select_double_ulong ),
-    ADD_TEST( select_double_long ),
+    ADD_TEST(select_uchar_uchar),   ADD_TEST(select_uchar_char),
+    ADD_TEST(select_char_uchar),    ADD_TEST(select_char_char),
+    ADD_TEST(select_ushort_ushort), ADD_TEST(select_ushort_short),
+    ADD_TEST(select_short_ushort),  ADD_TEST(select_short_short),
+    ADD_TEST(select_half_ushort),   ADD_TEST(select_half_short),
+    ADD_TEST(select_uint_uint),     ADD_TEST(select_uint_int),
+    ADD_TEST(select_int_uint),      ADD_TEST(select_int_int),
+    ADD_TEST(select_float_uint),    ADD_TEST(select_float_int),
+    ADD_TEST(select_ulong_ulong),   ADD_TEST(select_ulong_long),
+    ADD_TEST(select_long_ulong),    ADD_TEST(select_long_long),
+    ADD_TEST(select_double_ulong),  ADD_TEST(select_double_long),
 };
 
 const int test_num = ARRAY_SIZE( test_list );
diff --git a/test_conformance/select/test_select.h b/test_conformance/select/test_select.h
index c51ae13..5cd7860 100644
--- a/test_conformance/select/test_select.h
+++ b/test_conformance/select/test_select.h
@@ -28,18 +28,20 @@
 #endif
 
 // Defines the set of types we support (no support for double)
-typedef enum {
+typedef enum
+{
     kuchar = 0,
     kchar = 1,
     kushort = 2,
     kshort = 3,
-    kuint = 4,
-    kint = 5,
-    kfloat = 6,
-    kulong = 7,
-    klong = 8,
-    kdouble = 9,
-    kTypeCount  // always goes last
+    khalf = 4,
+    kuint = 5,
+    kint = 6,
+    kfloat = 7,
+    kulong = 8,
+    klong = 9,
+    kdouble = 10,
+    kTypeCount // always goes last
 } Type;
 
 
@@ -56,7 +58,8 @@
 extern const Type ctype[kTypeCount][2];
 
 // Reference functions for the primitive (non vector) type
-typedef void (*Select)(void *dest, void *src1, void *src2, void *cmp, size_t c);
+typedef void (*Select)(void *const dest, const void *const src1,
+                       const void *const src2, const void *const cmp, size_t c);
 extern Select refSelects[kTypeCount][2];
 
 // Reference functions for the primtive type but uses the vector
@@ -64,7 +67,8 @@
 extern Select vrefSelects[kTypeCount][2];
 
 // Check functions for each output type
-typedef size_t (*CheckResults)(void *out1, void *out2, size_t count, size_t vectorSize);
+typedef size_t (*CheckResults)(const void *const out1, const void *const out2,
+                               size_t count, size_t vectorSize);
 extern CheckResults checkResults[kTypeCount];
 
 // Helpful macros
diff --git a/test_conformance/select/util_select.cpp b/test_conformance/select/util_select.cpp
index f9641e9..078ff64 100644
--- a/test_conformance/select/util_select.cpp
+++ b/test_conformance/select/util_select.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 #include "harness/errorHelpers.h"
 
 #include <stdio.h>
+#include <cinttypes>
 #include "test_select.h"
 
 
@@ -25,29 +25,28 @@
 //-----------------------------------------
 
 
-const char *type_name[kTypeCount] = {
-    "uchar", "char",
-    "ushort", "short",
-    "uint",   "int",
-    "float",  "ulong", "long", "double" };
+const char *type_name[kTypeCount] = { "uchar", "char", "ushort", "short",
+                                      "half",  "uint", "int",    "float",
+                                      "ulong", "long", "double" };
 
 const size_t type_size[kTypeCount] = {
-    sizeof(cl_uchar), sizeof(cl_char),
-    sizeof(cl_ushort), sizeof(cl_short),
-    sizeof(cl_uint), sizeof(cl_int),
-    sizeof(cl_float), sizeof(cl_ulong), sizeof(cl_long), sizeof( cl_double ) };
+    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_half),  sizeof(cl_uint), sizeof(cl_int),    sizeof(cl_float),
+    sizeof(cl_ulong), sizeof(cl_long), sizeof(cl_double)
+};
 
 const Type ctype[kTypeCount][2] = {
-    { kuchar,  kchar },     // uchar
-    { kuchar,  kchar },     // char
-    { kushort, kshort},     // ushort
-    { kushort, kshort},     // short
-    { kuint,   kint  },     // uint
-    { kuint,   kint  },     // int
-    { kuint,   kint  },     // float
-    { kulong,  klong },     // ulong
-    { kulong,  klong },     // long
-    { kulong,  klong }     // double
+    { kuchar, kchar }, // uchar
+    { kuchar, kchar }, // char
+    { kushort, kshort }, // ushort
+    { kushort, kshort }, // short
+    { kushort, kshort }, // half
+    { kuint, kint }, // uint
+    { kuint, kint }, // int
+    { kuint, kint }, // float
+    { kulong, klong }, // ulong
+    { kulong, klong }, // long
+    { kulong, klong } // double
 };
 
 
@@ -55,510 +54,594 @@
 // Reference functions
 //-----------------------------------------
 
-void refselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_short*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*)dest;
-    x = (cl_int*)src1;
-    y = (cl_int*)src2;
-    m = (cl_int*)cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void refselect_1u32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*)dest;
-    x = (cl_uint*)src1;
-    y = (cl_uint*)src2;
-    m = (cl_int*)cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_hhi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_int *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
+    for (i = 0; i < count; ++i) d[i] = m[i] ? y[i] : x[i];
+}
+
+void refselect_hhu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
+    for (i = 0; i < count; ++i) d[i] = m[i] ? y[i] : x[i];
+}
+
+void refselect_ffi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ffu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_long *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void vrefselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*)src1;
-    y = (cl_ushort*)src2;
-    m = (cl_short*)cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void vrefselect_1u32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_hhi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000) ? y[i] : x[i];
+}
+
+void vrefselect_hhu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
+}
+
+void vrefselect_ffi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ffu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
 // Define refSelects
-Select refSelects[kTypeCount][2] =  {
-    { refselect_1u8u,  refselect_1u8  }, // cl_uchar
-    { refselect_1i8u,  refselect_1i8  }, // char
+Select refSelects[kTypeCount][2] = {
+    { refselect_1u8u, refselect_1u8 }, // cl_uchar
+    { refselect_1i8u, refselect_1i8 }, // char
     { refselect_1u16u, refselect_1u16 }, // ushort
     { refselect_1i16u, refselect_1i16 }, // short
+    { refselect_hhu, refselect_hhi }, // half
     { refselect_1u32u, refselect_1u32 }, // uint
     { refselect_1i32u, refselect_1i32 }, // int
-    { refselect_ffu,   refselect_ffi  }, // float
+    { refselect_ffu, refselect_ffi }, // float
     { refselect_1u64u, refselect_1u64 }, // ulong
     { refselect_1i64u, refselect_1i64 }, // long
-    { refselect_ddu,   refselect_ddi }   // double
+    { refselect_ddu, refselect_ddi } // double
 };
 
 // Define vrefSelects (vector refSelects)
-Select vrefSelects[kTypeCount][2] =  {
-    { vrefselect_1u8u,  vrefselect_1u8  }, // cl_uchar
-    { vrefselect_1i8u,  vrefselect_1i8  }, // char
+Select vrefSelects[kTypeCount][2] = {
+    { vrefselect_1u8u, vrefselect_1u8 }, // cl_uchar
+    { vrefselect_1i8u, vrefselect_1i8 }, // char
     { vrefselect_1u16u, vrefselect_1u16 }, // ushort
     { vrefselect_1i16u, vrefselect_1i16 }, // short
+    { vrefselect_hhu, vrefselect_hhi }, // half
     { vrefselect_1u32u, vrefselect_1u32 }, // uint
     { vrefselect_1i32u, vrefselect_1i32 }, // int
-    { vrefselect_ffu,   vrefselect_ffi  }, // float
+    { vrefselect_ffu, vrefselect_ffi }, // float
     { vrefselect_1u64u, vrefselect_1u64 }, // ulong
     { vrefselect_1i64u, vrefselect_1i64 }, // long
-    { vrefselect_ddu,   vrefselect_ddi  }     // double
+    { vrefselect_ddu, vrefselect_ddi } // double
 };
 
 
 //-----------------------------------------
 // Check functions
 //-----------------------------------------
-size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uchar *t = (const cl_uchar *) test;
-    const cl_uchar *c = (const cl_uchar *) correct;
+size_t check_uchar(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uchar *const t = (const cl_uchar *)test;
+    const cl_uchar *const c = (const cl_uchar *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -566,8 +649,8 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_uchar) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_uchar) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%2.2x vs 0x%2.2x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -576,9 +659,11 @@
     return 0;
 }
 
-size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_char *t = (const cl_char *) test;
-    const cl_char *c = (const cl_char *) correct;
+size_t check_char(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_char *const t = (const cl_char *)test;
+    const cl_char *const c = (const cl_char *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -586,8 +671,8 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_char) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_char) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%2.2x vs 0x%2.2x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -597,9 +682,11 @@
     return 0;
 }
 
-size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ushort *t = (const cl_ushort *) test;
-    const cl_ushort *c = (const cl_ushort *) correct;
+size_t check_ushort(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -607,8 +694,8 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_ushort) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_ushort) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%4.4x vs 0x%4.4x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -618,9 +705,11 @@
     return 0;
 }
 
-size_t check_short(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_short *t = (const cl_short *) test;
-    const cl_short *c = (const cl_short *) correct;
+size_t check_short(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_short *const t = (const cl_short *)test;
+    const cl_short *const c = (const cl_short *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -628,8 +717,8 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_short) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_short) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%8.8x vs 0x%8.8x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -639,9 +728,11 @@
     return 0;
 }
 
-size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+size_t check_uint(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -649,8 +740,8 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_uint) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_uint) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%8.8x vs 0x%8.8x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -660,9 +751,11 @@
     return 0;
 }
 
-size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_int *t = (const cl_int *) test;
-    const cl_int *c = (const cl_int *) correct;
+size_t check_int(const void *const test, const void *const correct,
+                 size_t count, size_t vector_size)
+{
+    const cl_int *const t = (const cl_int *)test;
+    const cl_int *const c = (const cl_int *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -671,8 +764,8 @@
             if (t[i] != c[i])
             {
 
-                log_error("\n(check_int) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_int) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%8.8x vs 0x%8.8x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -682,9 +775,11 @@
     return 0;
 }
 
-size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_ulong(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -692,9 +787,9 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_ulong) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
-                          "*0x%16.16llx vs 0x%16.16llx\n",
+                log_error("\n(check_ulong) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
+                          "*0x%16.16" PRIx64 " vs 0x%16.16" PRIx64 "\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
             }
@@ -703,9 +798,11 @@
     return 0;
 }
 
-size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_long *t = (const cl_long *) test;
-    const cl_long *c = (const cl_long *) correct;
+size_t check_long(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_long *const t = (const cl_long *)test;
+    const cl_long *const c = (const cl_long *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -713,9 +810,9 @@
         for (i = 0; i < count; i++)
             if (t[i] != c[i])
             {
-                log_error("\n(check_long) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
-                          "*0x%16.16llx vs 0x%16.16llx\n",
+                log_error("\n(check_long) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
+                          "*0x%16.16" PRIx64 " vs 0x%16.16" PRIx64 "\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
             }
@@ -724,9 +821,36 @@
     return 0;
 }
 
-size_t check_float( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+// Compares half results bit-for-bit, except that a NaN matches any NaN.
+// Returns 0 if everything matches, else the 1-based index of the first error.
+size_t check_half(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
+    size_t i;
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++) /* Allow nans to be binary different */
+            if ((t[i] != c[i]) // half NaN <=> (bits & 0x7fff) > 0x7c00
+                && !((c[i] & 0x7fff) > 0x7c00 && (t[i] & 0x7fff) > 0x7c00))
+            {
+                log_error("\n(check_half) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
+                          "*0x%4.4x vs 0x%4.4x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
+
+    return 0;
+}
+
+size_t check_float(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -735,8 +859,8 @@
             if ((t[i] != c[i])
                 && !(isnan(((float *)correct)[i]) && isnan(((float *)test)[i])))
             {
-                log_error("\n(check_float) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
+                log_error("\n(check_float) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
                           "*0x%8.8x vs 0x%8.8x\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
@@ -746,9 +870,11 @@
     return 0;
 }
 
-size_t check_double( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_double(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -758,9 +884,9 @@
                 && !(isnan(((double *)correct)[i])
                      && isnan(((double *)test)[i])))
             {
-                log_error("\n(check_double) Error for vector size %ld found at "
-                          "0x%8.8lx (of 0x%8.8lx):  "
-                          "*0x%16.16llx vs 0x%16.16llx\n",
+                log_error("\n(check_double) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
+                          "*0x%16.16" PRIx64 " vs 0x%16.16" PRIx64 "\n",
                           vector_size, i, count, c[i], t[i]);
                 return i + 1;
             }
@@ -770,5 +896,7 @@
 }
 
 CheckResults checkResults[kTypeCount] = {
-    check_uchar, check_char, check_ushort, check_short, check_uint,
-    check_int, check_float, check_ulong, check_long, check_double };
+    check_uchar, check_char, check_ushort, check_short,
+    check_half,  check_uint, check_int,    check_float,
+    check_ulong, check_long, check_double
+};
diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
new file mode 100644
index 0000000..4912718
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
@@ -0,0 +1,35 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 17
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %11 = OpLabel
+         %12 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %13 = OpCompositeExtract %uint %12 0
+         %14 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %13
+         %15 = OpLoad %half %14
+         %16 = OpFNegate %half %15
+               OpStore %14 %16
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
new file mode 100644
index 0000000..9c7e3d6
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
@@ -0,0 +1,39 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 20
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %12 = OpLabel
+         %13 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %14 = OpCompositeExtract %ulong %13 0
+         %15 = OpShiftLeftLogical %ulong %14 %ulong_32
+         %16 = OpShiftRightArithmetic %ulong %15 %ulong_32
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %16
+         %18 = OpLoad %half %17
+         %19 = OpFNegate %half %18
+               OpStore %17 %19
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32
new file mode 100644
index 0000000..985b526
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_extract" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8 ; eight lanes, as the v8half name states
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %13 = OpTypeFunction %void %_ptr_CrossWorkgroup_v8half %_ptr_CrossWorkgroup_half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %13
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_v8half
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %14 = OpFunctionParameter %uint
+         %15 = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %17 = OpCompositeExtract %uint %16 0
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %in %17
+         %19 = OpLoad %v8half %18
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %4 %17
+         %21 = OpVectorExtractDynamic %half %19 %14 ; lane %14 of the loaded vector
+               OpStore %20 %21
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64
new file mode 100644
index 0000000..dd14f66
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 26
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_extract" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %uint = OpTypeInt 32 0
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_v8half %_ptr_CrossWorkgroup_half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %15
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_v8half
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %16 = OpFunctionParameter %uint
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %in %21
+         %23 = OpLoad %v8half %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %4 %21
+         %25 = OpVectorExtractDynamic %half %23 %16
+               OpStore %24 %25
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32
new file mode 100644
index 0000000..2781293
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32
@@ -0,0 +1,43 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_insert" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %13 = OpTypeFunction %void %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_v8half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %13
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_v8half
+         %14 = OpFunctionParameter %uint
+         %15 = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %17 = OpCompositeExtract %uint %16 0
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %17
+         %19 = OpLoad %half %18
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %4 %17
+         %21 = OpLoad %v8half %20
+         %22 = OpVectorInsertDynamic %v8half %21 %19 %14
+               OpStore %20 %22
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64
new file mode 100644
index 0000000..f140fc2
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64
@@ -0,0 +1,48 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 27
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_insert" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %uint = OpTypeInt 32 0
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_v8half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %15
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_v8half
+         %16 = OpFunctionParameter %uint
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %21
+         %23 = OpLoad %half %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %4 %21
+         %25 = OpLoad %v8half %24
+         %26 = OpVectorInsertDynamic %v8half %25 %23 %16
+               OpStore %24 %26
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
new file mode 100644
index 0000000..6fda7d8
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 25
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %16 = OpLabel
+         %17 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %18 = OpCompositeExtract %uint %17 0
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %18
+         %20 = OpLoad %v4half %19 Aligned 8
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %18
+         %22 = OpLoad %half %21 Aligned 2
+         %23 = OpVectorTimesScalar %v4half %20 %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %18
+               OpStore %24 %23 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
new file mode 100644
index 0000000..fa2d522
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
@@ -0,0 +1,50 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 28
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; %ulong (64-bit) ids and indices are used below
+               OpCapability Float16 ; kernel operates on 16-bit float data
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID ; res[i] = lhs[i] * rhs[i]
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs ; NoCapture applies to all three kernel parameters
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32 ; shift amount used by %20 and %21
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0 ; 3-component global id
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; shift left by 32 ...
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; ... then back: sign-extends the low 32 bits
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %21 ; address of lhs element %21
+         %23 = OpLoad %v4half %22 Aligned 8 ; 4 x half vector operand
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %21 ; address of rhs element %21
+         %25 = OpLoad %half %24 Aligned 2 ; half scalar operand
+         %26 = OpVectorTimesScalar %v4half %23 %25 ; instruction under test
+         %27 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %21 ; address of res element %21
+               OpStore %27 %26 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp
index e3dc1f3..5009be9 100644
--- a/test_conformance/spirv_new/test_op_negate.cpp
+++ b/test_conformance/spirv_new/test_op_negate.cpp
@@ -32,6 +32,15 @@
             return 0;
         }
     }
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
 
     cl_int err = CL_SUCCESS;
     int num = (int)h_in.size();
@@ -73,29 +82,28 @@
     return 0;
 }
 
-#define TEST_NEGATION(TYPE, Tv, OP, FUNC)       \
-    TEST_SPIRV_FUNC(OP##_##TYPE)                \
-    {                                           \
-        int num = 1 << 20;                      \
-        std::vector<Tv> in(num);                \
-        RandomSeed seed(gRandomSeed);           \
-        for (int i = 0; i < num; i++) {         \
-            in[i] = genrand<Tv>(seed);          \
-        }                                       \
-        return test_negation<Tv>(deviceID,      \
-                                 context,       \
-                                 queue,         \
-                                 #TYPE,         \
-                                 #OP,           \
-                                 in, FUNC);     \
-    }                                           \
+#define TEST_NEGATION(TYPE, Tv, OP, FUNC)                                      \
+    TEST_SPIRV_FUNC(OP##_##TYPE)                                               \
+    { /* build random inputs, then delegate to test_negation */                \
+        int num = 1 << 20; /* 2^20 inputs */                                   \
+        std::vector<Tv> in(num);                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Tv>(seed);                                         \
+        }                                                                      \
+        return test_negation<Tv>(deviceID, context, queue, #TYPE, #OP, in,     \
+                                 FUNC);                                        \
+    }
 
 
+#define TEST_NEG_HALF TEST_NEGATION(half, cl_half, op_neg, negOpHalf)
 #define TEST_NEG(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_neg, negOp<cl_##TYPE>)
 #define TEST_NOT(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_not, notOp<cl_##TYPE>)
 #define TEST_NEG_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_neg, (negOpVec<cl_##TYPE##N, N>))
 #define TEST_NOT_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_not, (notOpVec<cl_##TYPE##N, N>))
 
+TEST_NEG_HALF
 TEST_NEG(float)
 TEST_NEG(double)
 TEST_NEG(int)
diff --git a/test_conformance/spirv_new/test_op_vector_extract.cpp b/test_conformance/spirv_new/test_op_vector_extract.cpp
index fe1f825..f77aa7a 100644
--- a/test_conformance/spirv_new/test_op_vector_extract.cpp
+++ b/test_conformance/spirv_new/test_op_vector_extract.cpp
@@ -25,6 +25,17 @@
             return 0;
         }
     }
+
+    if (std::string(name).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
 
     clProgramWrapper prog;
@@ -76,27 +87,30 @@
     return 0;
 }
 
-#define TEST_VECTOR_EXTRACT(TYPE, N)                        \
-    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_extract)          \
-    {                                                       \
-        typedef cl_##TYPE##N Tv;                            \
-        typedef cl_##TYPE Ts;                               \
-        const int num = 1 << 20;                            \
-        std::vector<Tv> in(num);                            \
-        const char *name = "vector_" #TYPE #N "_extract";   \
-                                                            \
-        RandomSeed seed(gRandomSeed);                       \
-                                                            \
-        for (int i = 0; i < num; i++) {                     \
-            in[i] = genrand<Tv>(seed);                      \
-        }                                                   \
-                                                            \
-        return test_extract<Tv, Ts>(deviceID,               \
-                                    context, queue,         \
-                                    name,                   \
-                                    in, N);                 \
+#define TEST_VECTOR_EXTRACT(TYPE, N)                                           \
+    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_extract)                             \
+    {                                                                          \
+        if (sizeof(cl_##TYPE) == 2) /* 2-byte scalar => needs fp16 */          \
+        { /* NOTE(review): also true for other 2-byte types */                 \
+            PASSIVE_REQUIRE_FP16_SUPPORT(deviceID);                            \
+        }                                                                      \
+        typedef cl_##TYPE##N Tv;                                               \
+        typedef cl_##TYPE Ts;                                                  \
+        const int num = 1 << 20;                                               \
+        std::vector<Tv> in(num);                                               \
+        const char *name = "vector_" #TYPE #N "_extract";                      \
+                                                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+                                                                               \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Tv>(seed);                                         \
+        }                                                                      \
+                                                                               \
+        return test_extract<Tv, Ts>(deviceID, context, queue, name, in, N);    \
     }
 
+TEST_VECTOR_EXTRACT(half, 8)
 TEST_VECTOR_EXTRACT(int, 4)
 TEST_VECTOR_EXTRACT(float, 4)
 TEST_VECTOR_EXTRACT(long, 2)
diff --git a/test_conformance/spirv_new/test_op_vector_insert.cpp b/test_conformance/spirv_new/test_op_vector_insert.cpp
index 0749c14..62fc78c 100644
--- a/test_conformance/spirv_new/test_op_vector_insert.cpp
+++ b/test_conformance/spirv_new/test_op_vector_insert.cpp
@@ -25,6 +25,17 @@
             return 0;
         }
     }
+
+    if (std::string(name).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
     clProgramWrapper prog;
     err = get_program_with_il(prog, deviceID, context, name);
@@ -94,27 +105,30 @@
     return 0;
 }
 
-#define TEST_VECTOR_INSERT(TYPE, N)                         \
-    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_insert)           \
-    {                                                       \
-        typedef cl_##TYPE##N Tv;                            \
-        typedef cl_##TYPE Ts;                               \
-        const int num = 1 << 20;                            \
-        std::vector<Ts> in(num);                            \
-        const char *name = "vector_" #TYPE #N "_insert";    \
-                                                            \
-        RandomSeed seed(gRandomSeed);                       \
-                                                            \
-        for (int i = 0; i < num; i++) {                     \
-            in[i] = genrand<Ts>(seed);                      \
-        }                                                   \
-                                                            \
-        return test_insert<Ts, Tv>(deviceID,                \
-                                   context, queue,          \
-                                   name,                    \
-                                   in, N);                  \
+#define TEST_VECTOR_INSERT(TYPE, N)                                            \
+    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_insert)                              \
+    {                                                                          \
+        if (sizeof(cl_##TYPE) == 2) /* 2-byte scalar => needs fp16 */          \
+        { /* NOTE(review): also true for other 2-byte types */                 \
+            PASSIVE_REQUIRE_FP16_SUPPORT(deviceID);                            \
+        }                                                                      \
+        typedef cl_##TYPE##N Tv;                                               \
+        typedef cl_##TYPE Ts;                                                  \
+        const int num = 1 << 20;                                               \
+        std::vector<Ts> in(num);                                               \
+        const char *name = "vector_" #TYPE #N "_insert";                       \
+                                                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+                                                                               \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Ts>(seed);                                         \
+        }                                                                      \
+                                                                               \
+        return test_insert<Ts, Tv>(deviceID, context, queue, name, in, N);     \
     }
 
+TEST_VECTOR_INSERT(half, 8)
 TEST_VECTOR_INSERT(int, 4)
 TEST_VECTOR_INSERT(float, 4)
 TEST_VECTOR_INSERT(long, 2)
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 0859668..0be4e8b 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -17,6 +17,8 @@
 #include <sstream>
 #include <string>
 
+using half = cl_half;
+
 template<typename Tv, typename Ts>
 int test_vector_times_scalar(cl_device_id deviceID,
                              cl_context context,
@@ -32,6 +34,16 @@
         }
     }
 
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info("Extension cl_khr_fp16 not supported; skipping half "
+                     "tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
     int num = (int)h_lhs.size();
     size_t lhs_bytes = num * sizeof(Tv);
@@ -171,5 +183,7 @@
                                                 lhs, rhs);      \
     }
 
+
 TEST_VECTOR_TIMES_SCALAR(float, 4)
 TEST_VECTOR_TIMES_SCALAR(double, 4)
+TEST_VECTOR_TIMES_SCALAR(half, 4)
diff --git a/test_conformance/spirv_new/types.hpp b/test_conformance/spirv_new/types.hpp
index e7fceba..728b244 100644
--- a/test_conformance/spirv_new/types.hpp
+++ b/test_conformance/spirv_new/types.hpp
@@ -43,6 +43,8 @@
 VEC_NOT_EQ_FUNC(cl_float, 4)
 VEC_NOT_EQ_FUNC(cl_double, 2)
 VEC_NOT_EQ_FUNC(cl_double, 4)
+VEC_NOT_EQ_FUNC(cl_half, 2)
+VEC_NOT_EQ_FUNC(cl_half, 4)
 
 template<typename T>
 bool isNotEqual(const T &lhs, const T &rhs)
@@ -109,6 +111,9 @@
 GENRAND_REAL_FUNC(cl_float, 4)
 GENRAND_REAL_FUNC(cl_double, 2)
 GENRAND_REAL_FUNC(cl_double, 4)
+GENRAND_REAL_FUNC(cl_half, 2)
+GENRAND_REAL_FUNC(cl_half, 4)
+GENRAND_REAL_FUNC(cl_half, 8)
 
 template<> inline cl_half genrandReal<cl_half>(RandomSeed &seed)
 {
@@ -157,6 +162,8 @@
     return -in;
 }
 
+inline cl_half negOpHalf(cl_half v) { return v ^ 0x8000; } // flips bit 15
+
 template<typename Tv>
 Tv notOp(Tv in)
 {
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index f779ef3..d9dfc3b 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -483,29 +483,30 @@
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                            const WorkGroupParams &test_params)
     {
-        int ii, i, j, k, n;
+        int ii, k;
+        size_t n;
         cl_uint l;
-        int nw = test_params.local_workgroup_size;
-        int ns = test_params.subgroup_size;
+        size_t nw = test_params.local_workgroup_size;
+        size_t ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        int nj = (nw + ns - 1) / ns;
+        size_t nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
 
         for (k = 0; k < ng; ++k)
         { // for each work_group
-            for (j = 0; j < nw; ++j)
+            for (size_t j = 0; j < nw; ++j)
             { // inside the work_group
                 mx[j] = x[j]; // read host inputs for work_group
                 my[j] = y[j]; // read device outputs for work_group
             }
 
-            for (j = 0; j < nj; ++j)
+            for (size_t j = 0; j < nj; ++j)
             { // for each subgroup
                 ii = j * ns;
                 n = ii + ns > nw ? nw - ii : ns;
 
-                for (i = 0; i < n; ++i)
+                for (size_t i = 0; i < n; ++i)
                 { // inside the subgroup
                   // shuffle index storage
                     int midx = 4 * ii + 4 * i + 2;
diff --git a/test_conformance/subgroups/subhelpers.cpp b/test_conformance/subgroups/subhelpers.cpp
index 11268f6..440cde2 100644
--- a/test_conformance/subgroups/subhelpers.cpp
+++ b/test_conformance/subgroups/subhelpers.cpp
@@ -206,7 +206,7 @@
 }
 
 void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
-                                  int sb_size)
+                                  size_t sb_size)
 {
     // max product is 720, cl_half has enough precision for it
     const std::vector<cl_ulong> non_one_values{ 2, 3, 4, 5, 6 };
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index bcb523c..ed92e5d 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -44,7 +44,7 @@
 // for each subgroup values defined different values
 // for rest of workitems set 1 shuffle values
 void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
-                                  int sb_size);
+                                  size_t sb_size);
 
 struct WorkGroupParams
 {
diff --git a/test_conformance/subgroups/test_workitem.cpp b/test_conformance/subgroups/test_workitem.cpp
index b69f313..5b2a5eb 100644
--- a/test_conformance/subgroups/test_workitem.cpp
+++ b/test_conformance/subgroups/test_workitem.cpp
@@ -36,7 +36,7 @@
 };
 
 static int check_group(const get_test_data *result, int nw, cl_uint ensg,
-                       int maxwgs)
+                       size_t maxwgs)
 {
     int first = -1;
     int last = -1;
@@ -168,7 +168,7 @@
 
         j = (result[first].subGroupSize + 31) / 32 * result[i].subGroupId
             + (result[i].subGroupLocalId >> 5);
-        if (j < sizeof(hit) / 4)
+        if (j < static_cast<int>(sizeof(hit) / 4))
         {
             cl_uint b = 1U << (result[i].subGroupLocalId & 0x1fU);
             if ((hit[j] & b) != 0)
@@ -191,7 +191,7 @@
     static const size_t lsize = 200;
     int error;
     int i, j, k, q, r, nw;
-    int maxwgs;
+    size_t maxwgs;
     cl_uint ensg;
     size_t global;
     size_t local;
@@ -235,7 +235,7 @@
     error = get_max_allowed_work_group_size(context, kernel, &local, NULL);
     if (error != 0) return error;
 
-    maxwgs = (int)local;
+    maxwgs = local;
 
     // Limit it a bit so we have muliple work groups
     // Ideally this will still be large enough to give us multiple subgroups
diff --git a/test_conformance/vulkan/main.cpp b/test_conformance/vulkan/main.cpp
index 5901420..eb1afeb 100644
--- a/test_conformance/vulkan/main.cpp
+++ b/test_conformance/vulkan/main.cpp
@@ -52,7 +52,8 @@
 }
 
 extern int test_buffer_common(cl_device_id device_, cl_context context_,
-                              cl_command_queue queue_, int numElements_);
+                              cl_command_queue queue_, int numElements_,
+                              float use_fence); // TODO: confirm float vs bool
 extern int test_image_common(cl_device_id device_, cl_context context_,
                              cl_command_queue queue_, int numElements_);
 
@@ -61,7 +62,7 @@
 {
     params_reset();
     log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiple_queue(cl_device_id device_, cl_context context_,
                                cl_command_queue queue_, int numElements_)
@@ -69,7 +70,7 @@
     params_reset();
     numCQ = 2;
     log_info("RUNNING TEST WITH TWO QUEUE...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiImport_sameCtx(cl_device_id device_, cl_context context_,
                                     cl_command_queue queue_, int numElements_)
@@ -78,7 +79,7 @@
     multiImport = true;
     log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
              "IN SAME CONTEXT...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiImport_diffCtx(cl_device_id device_, cl_context context_,
                                     cl_command_queue queue_, int numElements_)
@@ -88,7 +89,45 @@
     multiCtx = true;
     log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
              "IN DIFFERENT CONTEXT...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
+}
+int test_buffer_single_queue_fence(cl_device_id device_, cl_context context_,
+                                   cl_command_queue queue_, int numElements_)
+{ // Runs test_buffer_common with its use_fence argument set to true.
+    params_reset();
+    log_info("RUNNING TEST WITH ONE QUEUE USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiple_queue_fence(cl_device_id device_, cl_context context_,
+                                     cl_command_queue queue_, int numElements_)
+{ // Sets numCQ to 2, then runs test_buffer_common with use_fence = true.
+    params_reset();
+    numCQ = 2;
+    log_info("RUNNING TEST WITH TWO QUEUE USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiImport_sameCtx_fence(cl_device_id device_,
+                                          cl_context context_,
+                                          cl_command_queue queue_,
+                                          int numElements_)
+{ // Sets multiImport, then runs test_buffer_common with use_fence = true.
+    params_reset();
+    multiImport = true;
+    log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
+             "IN SAME CONTEXT USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiImport_diffCtx_fence(cl_device_id device_,
+                                          cl_context context_,
+                                          cl_command_queue queue_,
+                                          int numElements_)
+{ // Sets multiImport and multiCtx, then runs test_buffer_common with a fence.
+    params_reset();
+    multiImport = true;
+    multiCtx = true;
+    log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
+             "IN DIFFERENT CONTEXT USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
 }
 int test_image_single_queue(cl_device_id device_, cl_context context_,
                             cl_command_queue queue_, int numElements_)
@@ -110,6 +149,10 @@
                                 ADD_TEST(buffer_multiple_queue),
                                 ADD_TEST(buffer_multiImport_sameCtx),
                                 ADD_TEST(buffer_multiImport_diffCtx),
+                                ADD_TEST(buffer_single_queue_fence),
+                                ADD_TEST(buffer_multiple_queue_fence),
+                                ADD_TEST(buffer_multiImport_sameCtx_fence),
+                                ADD_TEST(buffer_multiImport_diffCtx_fence),
                                 ADD_TEST(image_single_queue),
                                 ADD_TEST(image_multiple_queue),
                                 ADD_TEST(consistency_external_buffer),
@@ -142,7 +185,6 @@
 bool useDeviceLocal = false;
 bool disableNTHandleType = false;
 bool enableOffset = false;
-bool non_dedicated = false;
 
 static void printUsage(const char *execName)
 {
@@ -189,10 +231,6 @@
             {
                 enableOffset = true;
             }
-            if (!strcmp(argv[i], "--non_dedicated"))
-            {
-                non_dedicated = true;
-            }
             if (strcmp(argv[i], "-h") == 0)
             {
                 printUsage(argv[0]);
diff --git a/test_conformance/vulkan/test_vulkan_api_consistency.cpp b/test_conformance/vulkan/test_vulkan_api_consistency.cpp
index f22ac31..d12b3bf 100644
--- a/test_conformance/vulkan/test_vulkan_api_consistency.cpp
+++ b/test_conformance/vulkan/test_vulkan_api_consistency.cpp
@@ -81,10 +81,11 @@
     const VulkanMemoryTypeList& memoryTypeList =
         vkDummyBuffer.getMemoryTypeList();
 
-    VulkanDeviceMemory* vkDeviceMem = new VulkanDeviceMemory(
-        vkDevice, bufferSize, memoryTypeList[0], vkExternalMemoryHandleType);
     VulkanBufferList vkBufferList(1, vkDevice, bufferSize,
                                   vkExternalMemoryHandleType);
+    VulkanDeviceMemory* vkDeviceMem =
+        new VulkanDeviceMemory(vkDevice, vkBufferList[0], memoryTypeList[0],
+                               vkExternalMemoryHandleType);
 
     vkDeviceMem->bindBuffer(vkBufferList[0], 0);
 
@@ -231,22 +232,27 @@
 
     VulkanExternalMemoryHandleType vkExternalMemoryHandleType =
         getSupportedVulkanExternalMemoryHandleTypeList()[0];
-    VulkanImage2D* vkImage2D =
-        new VulkanImage2D(vkDevice, VULKAN_FORMAT_R8G8B8A8_UNORM, width, height,
-                          1, vkExternalMemoryHandleType);
 
-    const VulkanMemoryTypeList& memoryTypeList = vkImage2D->getMemoryTypeList();
-    uint64_t totalImageMemSize = vkImage2D->getSize();
+    VulkanImageTiling vulkanImageTiling =
+        vkClExternalMemoryHandleTilingAssumption(
+            deviceID, vkExternalMemoryHandleType, &errNum);
+    ASSERT_SUCCESS(errNum, "Failed to query OpenCL tiling mode");
+
+    VulkanImage2D vkImage2D =
+        VulkanImage2D(vkDevice, VULKAN_FORMAT_R8G8B8A8_UNORM, width, height,
+                      vulkanImageTiling, 1, vkExternalMemoryHandleType);
+
+    const VulkanMemoryTypeList& memoryTypeList = vkImage2D.getMemoryTypeList();
+    uint64_t totalImageMemSize = vkImage2D.getSize();
 
     log_info("Memory type index: %lu\n", (uint32_t)memoryTypeList[0]);
     log_info("Memory type property: %d\n",
              memoryTypeList[0].getMemoryTypeProperty());
     log_info("Image size : %d\n", totalImageMemSize);
 
-    VulkanDeviceMemory* vkDeviceMem =
-        new VulkanDeviceMemory(vkDevice, totalImageMemSize, memoryTypeList[0],
-                               vkExternalMemoryHandleType);
-    vkDeviceMem->bindImage(*vkImage2D, 0);
+    VulkanDeviceMemory* vkDeviceMem = new VulkanDeviceMemory(
+        vkDevice, vkImage2D, memoryTypeList[0], vkExternalMemoryHandleType);
+    vkDeviceMem->bindImage(vkImage2D, 0);
 
     void* handle = NULL;
     int fd;
@@ -299,7 +305,7 @@
     extMemProperties.push_back(0);
 
     const VkImageCreateInfo VulkanImageCreateInfo =
-        vkImage2D->getVkImageCreateInfo();
+        vkImage2D.getVkImageCreateInfo();
 
     errNum = getCLImageInfoFromVkImageInfo(
         &VulkanImageCreateInfo, totalImageMemSize, &img_format, &image_desc);
diff --git a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
index 9b0bc9d..559625d 100644
--- a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
+++ b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
@@ -21,6 +21,7 @@
 #include <assert.h>
 #include <vector>
 #include <iostream>
+#include <memory>
 #include <string.h>
 #include "harness/errorHelpers.h"
 
@@ -82,7 +83,8 @@
 int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
                             cl_command_queue &cmd_queue2, cl_kernel *kernel,
                             cl_kernel &verify_kernel, VulkanDevice &vkDevice,
-                            uint32_t numBuffers, uint32_t bufferSize)
+                            uint32_t numBuffers, uint32_t bufferSize,
+                            bool use_fence)
 {
     int err = CL_SUCCESS;
     size_t global_work_size[1];
@@ -117,14 +119,18 @@
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
     std::vector<char> vkBufferShader = readFile("buffer.spv");
 
     VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
-    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
-        MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList;
+    vkDescriptorSetLayoutBindingList.addBinding(
+        0, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1);
+    vkDescriptorSetLayoutBindingList.addBinding(
+        1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, MAX_BUFFERS);
     VulkanDescriptorSetLayout vkDescriptorSetLayout(
         vkDevice, vkDescriptorSetLayoutBindingList);
     VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
@@ -136,10 +142,17 @@
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
 
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
@@ -179,9 +192,9 @@
 
             for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
             {
-                vkBufferListDeviceMemory.push_back(
-                    new VulkanDeviceMemory(vkDevice, bufferSize, memoryType,
-                                           vkExternalMemoryHandleType));
+                vkBufferListDeviceMemory.push_back(new VulkanDeviceMemory(
+                    vkDevice, vkBufferList[bIdx], memoryType,
+                    vkExternalMemoryHandleType));
                 externalMemory.push_back(new clExternalMemory(
                     vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType,
                     0, bufferSize, context, deviceId));
@@ -200,8 +213,8 @@
                 vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx],
                                                            0);
                 buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer();
-                vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]);
             }
+            vkDescriptorSet.updateArray(1, numBuffers, vkBufferList);
             vkCommandBuffer.begin();
             vkCommandBuffer.bindPipeline(vkComputePipeline);
             vkCommandBuffer.bindDescriptorSets(
@@ -227,16 +240,27 @@
             for (uint32_t iter = 0; iter < maxIter; iter++)
             {
 
-                if (iter == 0)
+                if (use_fence)
                 {
-                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    fence->reset();
+                    vkQueue.submit(vkCommandBuffer, fence);
+                    fence->wait();
                 }
                 else
                 {
-                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                   vkVk2CLSemaphore);
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
                 }
-                clVk2CLExternalSemaphore->wait(cmd_queue1);
+
 
                 err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
                                      (void *)&bufferSize);
@@ -286,7 +310,14 @@
                     goto CLEANUP;
                 }
 
-                if (iter != (maxIter - 1))
+                if (use_fence)
+                {
+                    clFlush(cmd_queue1);
+                    clFlush(cmd_queue2);
+                    clFinish(cmd_queue1);
+                    clFinish(cmd_queue2);
+                }
+                else if (iter != (maxIter - 1))
                 {
                     clCl2VkExternalSemaphore->signal(cmd_queue2);
                 }
@@ -387,8 +418,11 @@
     }
     if (program) clReleaseProgram(program);
     if (kernel_cq) clReleaseKernel(kernel_cq);
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
 
@@ -398,7 +432,7 @@
 int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
                             cl_kernel *kernel, cl_kernel &verify_kernel,
                             VulkanDevice &vkDevice, uint32_t numBuffers,
-                            uint32_t bufferSize)
+                            uint32_t bufferSize, bool use_fence)
 {
     log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
     size_t global_work_size[1];
@@ -416,13 +450,17 @@
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
     std::vector<char> vkBufferShader = readFile("buffer.spv");
     VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
-    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
-        MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList;
+    vkDescriptorSetLayoutBindingList.addBinding(
+        0, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1);
+    vkDescriptorSetLayoutBindingList.addBinding(
+        1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, MAX_BUFFERS);
     VulkanDescriptorSetLayout vkDescriptorSetLayout(
         vkDevice, vkDescriptorSetLayoutBindingList);
     VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
@@ -434,10 +472,18 @@
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
+
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
     VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
@@ -477,9 +523,9 @@
 
             for (size_t bIdx = 0; bIdx < numBuffers; bIdx++)
             {
-                vkBufferListDeviceMemory.push_back(
-                    new VulkanDeviceMemory(vkDevice, bufferSize, memoryType,
-                                           vkExternalMemoryHandleType));
+                vkBufferListDeviceMemory.push_back(new VulkanDeviceMemory(
+                    vkDevice, vkBufferList[bIdx], memoryType,
+                    vkExternalMemoryHandleType));
                 externalMemory.push_back(new clExternalMemory(
                     vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType,
                     0, bufferSize, context, deviceId));
@@ -498,8 +544,9 @@
                 vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx],
                                                            0);
                 buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer();
-                vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]);
             }
+            vkDescriptorSet.updateArray(1, vkBufferList.size(), vkBufferList);
+
             vkCommandBuffer.begin();
             vkCommandBuffer.bindPipeline(vkComputePipeline);
             vkCommandBuffer.bindDescriptorSets(
@@ -526,16 +573,26 @@
 
             for (uint32_t iter = 0; iter < maxIter; iter++)
             {
-                if (iter == 0)
+                if (use_fence)
                 {
-                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    fence->reset();
+                    vkQueue.submit(vkCommandBuffer, fence);
+                    fence->wait();
                 }
                 else
                 {
-                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                   vkVk2CLSemaphore);
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
                 }
-                clVk2CLExternalSemaphore->wait(cmd_queue1);
 
                 err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
                                      (void *)&bufferSize);
@@ -562,7 +619,12 @@
                                 " error\n");
                     goto CLEANUP;
                 }
-                if (iter != (maxIter - 1))
+                if (use_fence)
+                {
+                    clFlush(cmd_queue1);
+                    clFinish(cmd_queue1);
+                }
+                else if (iter != (maxIter - 1))
                 {
                     clCl2VkExternalSemaphore->signal(cmd_queue1);
                 }
@@ -656,8 +718,13 @@
             delete externalMemory[i];
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
+
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
     return err;
@@ -666,7 +733,7 @@
 int run_test_with_multi_import_same_ctx(
     cl_context &context, cl_command_queue &cmd_queue1, cl_kernel *kernel,
     cl_kernel &verify_kernel, VulkanDevice &vkDevice, uint32_t numBuffers,
-    uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    uint32_t bufferSize, uint32_t bufferSizeForOffset, bool use_fence)
 {
     size_t global_work_size[1];
     uint8_t *error_2;
@@ -687,14 +754,18 @@
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
     std::vector<char> vkBufferShader = readFile("buffer.spv");
 
     VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader);
-    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
-        MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList;
+    vkDescriptorSetLayoutBindingList.addBinding(
+        0, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1);
+    vkDescriptorSetLayoutBindingList.addBinding(
+        1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, MAX_BUFFERS);
     VulkanDescriptorSetLayout vkDescriptorSetLayout(
         vkDevice, vkDescriptorSetLayoutBindingList);
     VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
@@ -706,10 +777,18 @@
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
+
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
     VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
@@ -767,7 +846,7 @@
                     if (withOffset == 0)
                     {
                         vkBufferListDeviceMemory.push_back(
-                            new VulkanDeviceMemory(vkDevice, pBufferSize,
+                            new VulkanDeviceMemory(vkDevice, vkBufferList[bIdx],
                                                    memoryType,
                                                    vkExternalMemoryHandleType));
                     }
@@ -811,9 +890,8 @@
                             externalMemory[bIdx][cl_bIdx]
                                 ->getExternalMemoryBuffer();
                     }
-                    vkDescriptorSet.update((uint32_t)bIdx + 1,
-                                           vkBufferList[bIdx]);
                 }
+                vkDescriptorSet.updateArray(1, numBuffers, vkBufferList);
                 vkCommandBuffer.begin();
                 vkCommandBuffer.bindPipeline(vkComputePipeline);
                 vkCommandBuffer.bindDescriptorSets(
@@ -832,16 +910,34 @@
 
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        fence->wait();
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
                     }
-                    clVk2CLExternalSemaphore->wait(cmd_queue1);
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore->wait(cmd_queue1);
+                    }
+
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
                     {
@@ -874,7 +970,11 @@
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue1);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore->signal(cmd_queue1);
                     }
@@ -987,8 +1087,13 @@
             }
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
+
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
     return err;
@@ -998,7 +1103,8 @@
     cl_context &context, cl_context &context2, cl_command_queue &cmd_queue1,
     cl_command_queue &cmd_queue2, cl_kernel *kernel1, cl_kernel *kernel2,
     cl_kernel &verify_kernel, cl_kernel verify_kernel2, VulkanDevice &vkDevice,
-    uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset,
+    bool use_fence)
 {
     size_t global_work_size[1];
     uint8_t *error_3;
@@ -1023,6 +1129,7 @@
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
@@ -1042,15 +1149,24 @@
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
-    clVk2CLExternalSemaphore2 = new clExternalSemaphore(
-        vkVk2CLSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore2 = new clExternalSemaphore(
-        vkCl2VkSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
+        clVk2CLExternalSemaphore2 =
+            new clExternalSemaphore(vkVk2CLSemaphore, context2,
+                                    vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore2 =
+            new clExternalSemaphore(vkCl2VkSemaphore, context2,
+                                    vkExternalSemaphoreHandleType, deviceId);
+    }
 
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
@@ -1192,16 +1308,33 @@
 
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        fence->wait();
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
                     }
-                    clVk2CLExternalSemaphore->wait(cmd_queue1);
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore->wait(cmd_queue1);
+                    }
 
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
@@ -1235,7 +1368,11 @@
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue1);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore->signal(cmd_queue1);
                     }
@@ -1243,16 +1380,33 @@
                 clFinish(cmd_queue1);
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        fence->wait();
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
                     }
-                    clVk2CLExternalSemaphore2->wait(cmd_queue2);
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore2->wait(cmd_queue2);
+                    }
 
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
@@ -1286,7 +1440,11 @@
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue2);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore2->signal(cmd_queue2);
                     }
@@ -1474,10 +1632,15 @@
             }
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
-    if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2;
-    if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+        if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2;
+        if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2;
+    }
+
     if (error_3) free(error_3);
     if (error_1) clReleaseMemObject(error_1);
     if (error_2) clReleaseMemObject(error_2);
@@ -1485,7 +1648,8 @@
 }
 
 int test_buffer_common(cl_device_id device_, cl_context context_,
-                       cl_command_queue queue_, int numElements_)
+                       cl_command_queue queue_, int numElements_,
+                       float use_fence)
 {
 
     int current_device = 0;
@@ -1738,26 +1902,26 @@
             {
                 errNum = run_test_with_multi_import_same_ctx(
                     context, cmd_queue1, kernel, verify_kernel, vkDevice,
-                    numBuffers, bufferSize, bufferSizeForOffset);
+                    numBuffers, bufferSize, bufferSizeForOffset, use_fence);
             }
             else if (multiImport && multiCtx)
             {
                 errNum = run_test_with_multi_import_diff_ctx(
                     context, context2, cmd_queue1, cmd_queue3, kernel, kernel2,
                     verify_kernel, verify_kernel2, vkDevice, numBuffers,
-                    bufferSize, bufferSizeForOffset);
+                    bufferSize, bufferSizeForOffset, use_fence);
             }
             else if (numCQ == 2)
             {
                 errNum = run_test_with_two_queue(
                     context, cmd_queue1, cmd_queue2, kernel, verify_kernel,
-                    vkDevice, numBuffers + 1, bufferSize);
+                    vkDevice, numBuffers + 1, bufferSize, use_fence);
             }
             else
             {
-                errNum = run_test_with_one_queue(context, cmd_queue1, kernel,
-                                                 verify_kernel, vkDevice,
-                                                 numBuffers, bufferSize);
+                errNum = run_test_with_one_queue(
+                    context, cmd_queue1, kernel, verify_kernel, vkDevice,
+                    numBuffers, bufferSize, use_fence);
             }
             if (errNum != CL_SUCCESS)
             {
diff --git a/test_conformance/vulkan/test_vulkan_interop_image.cpp b/test_conformance/vulkan/test_vulkan_interop_image.cpp
index 47a3166..5f1f6e4 100644
--- a/test_conformance/vulkan/test_vulkan_interop_image.cpp
+++ b/test_conformance/vulkan/test_vulkan_interop_image.cpp
@@ -226,9 +226,11 @@
     srcBufferPtr = (char *)malloc(maxImage2DSize);
     dstBufferPtr = (char *)malloc(maxImage2DSize);
 
-    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
-        VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
-        VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList;
+    vkDescriptorSetLayoutBindingList.addBinding(
+        0, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1);
+    vkDescriptorSetLayoutBindingList.addBinding(
+        1, VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS);
     VulkanDescriptorSetLayout vkDescriptorSetLayout(
         vkDevice, vkDescriptorSetLayoutBindingList);
     VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
@@ -255,10 +257,10 @@
     clCl2VkExternalSemaphore = new clExternalSemaphore(
         vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
-    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1;
-    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2;
-    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1;
-    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2;
+    std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory1;
+    std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory2;
+    std::vector<clExternalMemoryImage *> externalMemory1;
+    std::vector<clExternalMemoryImage *> externalMemory2;
     std::vector<char> vkImage2DShader;
 
     for (size_t fIdx = 0; fIdx < vkFormatList.size(); fIdx++)
@@ -352,8 +354,6 @@
                         VulkanExternalMemoryHandleType
                             vkExternalMemoryHandleType =
                                 vkExternalMemoryHandleTypeList[emhtIdx];
-                        log_info("External memory handle type: %d \n",
-                                 vkExternalMemoryHandleType);
                         if ((true == disableNTHandleType)
                             && (VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT
                                 == vkExternalMemoryHandleType))
@@ -361,9 +361,19 @@
                             // Skip running for WIN32 NT handle.
                             continue;
                         }
+                        log_info("External memory handle type: %d \n",
+                                 vkExternalMemoryHandleType);
+                        VulkanImageTiling vulkanImageTiling =
+                            vkClExternalMemoryHandleTilingAssumption(
+                                deviceId,
+                                vkExternalMemoryHandleTypeList[emhtIdx], &err);
+                        ASSERT_SUCCESS(err,
+                                       "Failed to query OpenCL tiling mode");
+
                         VulkanImage2D vkDummyImage2D(
                             vkDevice, vkFormatList[0], widthList[0],
-                            heightList[0], 1, vkExternalMemoryHandleType);
+                            heightList[0], vulkanImageTiling, 1,
+                            vkExternalMemoryHandleType);
                         const VulkanMemoryTypeList &memoryTypeList =
                             vkDummyImage2D.getMemoryTypeList();
 
@@ -390,118 +400,73 @@
                             {
                                 VulkanImage2D vkImage2D(
                                     vkDevice, vkFormat, width, height,
-                                    numMipLevels, vkExternalMemoryHandleType);
+                                    vulkanImageTiling, numMipLevels,
+                                    vkExternalMemoryHandleType);
                                 ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize);
                                 totalImageMemSize =
                                     ROUND_UP(vkImage2D.getSize(),
                                              vkImage2D.getAlignment());
                             }
-                            VulkanImage2DList vkNonDedicatedImage2DList(
+                            VulkanImage2DList vkImage2DList(
                                 num2DImages, vkDevice, vkFormat, width, height,
-                                numMipLevels, vkExternalMemoryHandleType);
+                                vulkanImageTiling, numMipLevels,
+                                vkExternalMemoryHandleType);
                             for (size_t bIdx = 0; bIdx < num2DImages; bIdx++)
                             {
-                                if (non_dedicated)
-                                {
-                                    vkNonDedicatedImage2DListDeviceMemory1
-                                        .push_back(new VulkanDeviceMemory(
-                                            vkDevice, totalImageMemSize,
-                                            memoryType,
-                                            vkExternalMemoryHandleType));
-                                }
-                                else
-                                {
-                                    vkNonDedicatedImage2DListDeviceMemory1
-                                        .push_back(new VulkanDeviceMemory(
-                                            vkDevice,
-                                            vkNonDedicatedImage2DList[bIdx],
-                                            memoryType,
-                                            vkExternalMemoryHandleType));
-                                }
-                                vkNonDedicatedImage2DListDeviceMemory1[bIdx]
-                                    ->bindImage(vkNonDedicatedImage2DList[bIdx],
-                                                0);
-                                nonDedicatedExternalMemory1.push_back(
+                                vkImage2DListDeviceMemory1.push_back(
+                                    new VulkanDeviceMemory(
+                                        vkDevice, vkImage2DList[bIdx],
+                                        memoryType,
+                                        vkExternalMemoryHandleType));
+                                vkImage2DListDeviceMemory1[bIdx]->bindImage(
+                                    vkImage2DList[bIdx], 0);
+                                externalMemory1.push_back(
                                     new clExternalMemoryImage(
-                                        *vkNonDedicatedImage2DListDeviceMemory1
-                                            [bIdx],
+                                        *vkImage2DListDeviceMemory1[bIdx],
                                         vkExternalMemoryHandleType, context,
                                         totalImageMemSize, width, height, 0,
-                                        vkNonDedicatedImage2DList[bIdx],
-                                        deviceId));
+                                        vkImage2DList[bIdx], deviceId));
                             }
-                            VulkanImageViewList vkNonDedicatedImage2DViewList(
-                                vkDevice, vkNonDedicatedImage2DList);
-                            VulkanImage2DList vkNonDedicatedImage2DList2(
+                            VulkanImageViewList vkImage2DViewList(
+                                vkDevice, vkImage2DList);
+                            VulkanImage2DList vkImage2DList2(
                                 num2DImages, vkDevice, vkFormat, width, height,
-                                numMipLevels, vkExternalMemoryHandleType);
+                                vulkanImageTiling, numMipLevels,
+                                vkExternalMemoryHandleType);
                             for (size_t bIdx = 0; bIdx < num2DImages; bIdx++)
                             {
-                                if (non_dedicated)
-                                {
-                                    vkNonDedicatedImage2DListDeviceMemory2
-                                        .push_back(new VulkanDeviceMemory(
-                                            vkDevice, totalImageMemSize,
-                                            memoryType,
-                                            vkExternalMemoryHandleType));
-                                }
-                                else
-                                {
-                                    vkNonDedicatedImage2DListDeviceMemory2
-                                        .push_back(new VulkanDeviceMemory(
-                                            vkDevice,
-                                            vkNonDedicatedImage2DList2[bIdx],
-                                            memoryType,
-                                            vkExternalMemoryHandleType));
-                                }
-                                vkNonDedicatedImage2DListDeviceMemory2[bIdx]
-                                    ->bindImage(
-                                        vkNonDedicatedImage2DList2[bIdx], 0);
-                                nonDedicatedExternalMemory2.push_back(
+                                vkImage2DListDeviceMemory2.push_back(
+                                    new VulkanDeviceMemory(
+                                        vkDevice, vkImage2DList2[bIdx],
+                                        memoryType,
+                                        vkExternalMemoryHandleType));
+                                vkImage2DListDeviceMemory2[bIdx]->bindImage(
+                                    vkImage2DList2[bIdx], 0);
+                                externalMemory2.push_back(
                                     new clExternalMemoryImage(
-                                        *vkNonDedicatedImage2DListDeviceMemory2
-                                            [bIdx],
+                                        *vkImage2DListDeviceMemory2[bIdx],
                                         vkExternalMemoryHandleType, context,
                                         totalImageMemSize, width, height, 0,
-                                        vkNonDedicatedImage2DList2[bIdx],
-                                        deviceId));
+                                        vkImage2DList2[bIdx], deviceId));
                             }
-                            VulkanImageViewList vkDedicatedImage2DViewList(
-                                vkDevice, vkNonDedicatedImage2DList2);
 
                             cl_mem external_mem_image1[5];
                             cl_mem external_mem_image2[5];
                             for (int i = 0; i < num2DImages; i++)
                             {
                                 external_mem_image1[i] =
-                                    nonDedicatedExternalMemory1[i]
+                                    externalMemory1[i]
                                         ->getExternalMemoryImage();
                                 external_mem_image2[i] =
-                                    nonDedicatedExternalMemory2[i]
+                                    externalMemory2[i]
                                         ->getExternalMemoryImage();
                             }
-                            VulkanImage2DList &vkImage2DList =
-                                vkNonDedicatedImage2DList;
-                            VulkanImageViewList &vkImage2DViewList =
-                                vkNonDedicatedImage2DViewList;
 
                             clCl2VkExternalSemaphore->signal(cmd_queue1);
                             if (!useSingleImageKernel)
                             {
-                                for (size_t i2DIdx = 0;
-                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
-                                {
-                                    for (uint32_t mipLevel = 0;
-                                         mipLevel < numMipLevels; mipLevel++)
-                                    {
-                                        uint32_t i2DvIdx =
-                                            (uint32_t)(i2DIdx * numMipLevels)
-                                            + mipLevel;
-                                        vkDescriptorSet.update(
-                                            1 + i2DvIdx,
-                                            vkImage2DViewList[i2DvIdx]);
-                                    }
-                                }
+                                vkDescriptorSet.updateArray(1,
+                                                            vkImage2DViewList);
                                 vkCopyCommandBuffer.begin();
                                 vkCopyCommandBuffer.pipelineBarrier(
                                     vkImage2DList,
@@ -743,29 +708,25 @@
                             }
                             for (int i = 0; i < num2DImages; i++)
                             {
-                                delete vkNonDedicatedImage2DListDeviceMemory1
-                                    [i];
-                                delete vkNonDedicatedImage2DListDeviceMemory2
-                                    [i];
-                                delete nonDedicatedExternalMemory1[i];
-                                delete nonDedicatedExternalMemory2[i];
+                                delete vkImage2DListDeviceMemory1[i];
+                                delete vkImage2DListDeviceMemory2[i];
+                                delete externalMemory1[i];
+                                delete externalMemory2[i];
                             }
-                            vkNonDedicatedImage2DListDeviceMemory1.erase(
-                                vkNonDedicatedImage2DListDeviceMemory1.begin(),
-                                vkNonDedicatedImage2DListDeviceMemory1.begin()
+                            vkImage2DListDeviceMemory1.erase(
+                                vkImage2DListDeviceMemory1.begin(),
+                                vkImage2DListDeviceMemory1.begin()
                                     + num2DImages);
-                            vkNonDedicatedImage2DListDeviceMemory2.erase(
-                                vkNonDedicatedImage2DListDeviceMemory2.begin(),
-                                vkNonDedicatedImage2DListDeviceMemory2.begin()
+                            vkImage2DListDeviceMemory2.erase(
+                                vkImage2DListDeviceMemory2.begin(),
+                                vkImage2DListDeviceMemory2.begin()
                                     + num2DImages);
-                            nonDedicatedExternalMemory1.erase(
-                                nonDedicatedExternalMemory1.begin(),
-                                nonDedicatedExternalMemory1.begin()
-                                    + num2DImages);
-                            nonDedicatedExternalMemory2.erase(
-                                nonDedicatedExternalMemory2.begin(),
-                                nonDedicatedExternalMemory2.begin()
-                                    + num2DImages);
+                            externalMemory1.erase(externalMemory1.begin(),
+                                                  externalMemory1.begin()
+                                                      + num2DImages);
+                            externalMemory2.erase(externalMemory2.begin(),
+                                                  externalMemory2.begin()
+                                                      + num2DImages);
                             if (CL_SUCCESS != err)
                             {
                                 goto CLEANUP;
@@ -822,9 +783,11 @@
     srcBufferPtr = (char *)malloc(maxImage2DSize);
     dstBufferPtr = (char *)malloc(maxImage2DSize);
 
-    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList(
-        VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
-        VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS);
+    VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList;
+    vkDescriptorSetLayoutBindingList.addBinding(
+        0, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1);
+    vkDescriptorSetLayoutBindingList.addBinding(
+        1, VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS);
     VulkanDescriptorSetLayout vkDescriptorSetLayout(
         vkDevice, vkDescriptorSetLayoutBindingList);
     VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout);
@@ -851,10 +814,10 @@
     clCl2VkExternalSemaphore = new clExternalSemaphore(
         vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
-    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1;
-    std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2;
-    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1;
-    std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2;
+    std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory1;
+    std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory2;
+    std::vector<clExternalMemoryImage *> externalMemory1;
+    std::vector<clExternalMemoryImage *> externalMemory2;
     std::vector<char> vkImage2DShader;
 
     for (size_t fIdx = 0; fIdx < vkFormatList.size(); fIdx++)
@@ -957,9 +920,18 @@
                             // Skip running for WIN32 NT handle.
                             continue;
                         }
+
+                        VulkanImageTiling vulkanImageTiling =
+                            vkClExternalMemoryHandleTilingAssumption(
+                                deviceId,
+                                vkExternalMemoryHandleTypeList[emhtIdx], &err);
+                        ASSERT_SUCCESS(err,
+                                       "Failed to query OpenCL tiling mode");
+
                         VulkanImage2D vkDummyImage2D(
                             vkDevice, vkFormatList[0], widthList[0],
-                            heightList[0], 1, vkExternalMemoryHandleType);
+                            heightList[0], vulkanImageTiling, 1,
+                            vkExternalMemoryHandleType);
                         const VulkanMemoryTypeList &memoryTypeList =
                             vkDummyImage2D.getMemoryTypeList();
 
@@ -985,98 +957,78 @@
                             {
                                 VulkanImage2D vkImage2D(
                                     vkDevice, vkFormat, width, height,
-                                    numMipLevels, vkExternalMemoryHandleType);
+                                    vulkanImageTiling, numMipLevels,
+                                    vkExternalMemoryHandleType);
                                 ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize);
                                 totalImageMemSize =
                                     ROUND_UP(vkImage2D.getSize(),
                                              vkImage2D.getAlignment());
                             }
-                            VulkanImage2DList vkNonDedicatedImage2DList(
+                            VulkanImage2DList vkImage2DList(
                                 num2DImages, vkDevice, vkFormat, width, height,
-                                numMipLevels, vkExternalMemoryHandleType);
-                            for (size_t bIdx = 0;
-                                 bIdx < vkNonDedicatedImage2DList.size();
+                                vulkanImageTiling, numMipLevels,
+                                vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0; bIdx < vkImage2DList.size();
                                  bIdx++)
                             {
                                 // Create list of Vulkan device memories and
                                 // bind the list of Vulkan images.
-                                vkNonDedicatedImage2DListDeviceMemory1
-                                    .push_back(new VulkanDeviceMemory(
-                                        vkDevice, totalImageMemSize, memoryType,
+                                vkImage2DListDeviceMemory1.push_back(
+                                    new VulkanDeviceMemory(
+                                        vkDevice, vkImage2DList[bIdx],
+                                        memoryType,
                                         vkExternalMemoryHandleType));
-                                vkNonDedicatedImage2DListDeviceMemory1[bIdx]
-                                    ->bindImage(vkNonDedicatedImage2DList[bIdx],
-                                                0);
-                                nonDedicatedExternalMemory1.push_back(
+                                vkImage2DListDeviceMemory1[bIdx]->bindImage(
+                                    vkImage2DList[bIdx], 0);
+                                externalMemory1.push_back(
                                     new clExternalMemoryImage(
-                                        *vkNonDedicatedImage2DListDeviceMemory1
-                                            [bIdx],
+                                        *vkImage2DListDeviceMemory1[bIdx],
                                         vkExternalMemoryHandleType, context,
                                         totalImageMemSize, width, height, 0,
-                                        vkNonDedicatedImage2DList[bIdx],
-                                        deviceId));
+                                        vkImage2DList[bIdx], deviceId));
                             }
-                            VulkanImageViewList vkNonDedicatedImage2DViewList(
-                                vkDevice, vkNonDedicatedImage2DList);
+                            VulkanImageViewList vkImage2DViewList(
+                                vkDevice, vkImage2DList);
 
-                            VulkanImage2DList vkNonDedicatedImage2DList2(
+                            VulkanImage2DList vkImage2DList2(
                                 num2DImages, vkDevice, vkFormat, width, height,
-                                numMipLevels, vkExternalMemoryHandleType);
-                            for (size_t bIdx = 0;
-                                 bIdx < vkNonDedicatedImage2DList2.size();
+                                vulkanImageTiling, numMipLevels,
+                                vkExternalMemoryHandleType);
+                            for (size_t bIdx = 0; bIdx < vkImage2DList2.size();
                                  bIdx++)
                             {
-                                vkNonDedicatedImage2DListDeviceMemory2
-                                    .push_back(new VulkanDeviceMemory(
-                                        vkDevice, totalImageMemSize, memoryType,
+                                vkImage2DListDeviceMemory2.push_back(
+                                    new VulkanDeviceMemory(
+                                        vkDevice, vkImage2DList2[bIdx],
+                                        memoryType,
                                         vkExternalMemoryHandleType));
-                                vkNonDedicatedImage2DListDeviceMemory2[bIdx]
-                                    ->bindImage(
-                                        vkNonDedicatedImage2DList2[bIdx], 0);
-                                nonDedicatedExternalMemory2.push_back(
+                                vkImage2DListDeviceMemory2[bIdx]->bindImage(
+                                    vkImage2DList2[bIdx], 0);
+                                externalMemory2.push_back(
                                     new clExternalMemoryImage(
-                                        *vkNonDedicatedImage2DListDeviceMemory2
-                                            [bIdx],
+                                        *vkImage2DListDeviceMemory2[bIdx],
                                         vkExternalMemoryHandleType, context,
                                         totalImageMemSize, width, height, 0,
-                                        vkNonDedicatedImage2DList2[bIdx],
-                                        deviceId));
+                                        vkImage2DList2[bIdx], deviceId));
                             }
-                            VulkanImageViewList vkDedicatedImage2DViewList(
-                                vkDevice, vkNonDedicatedImage2DList2);
+
                             cl_mem external_mem_image1[4];
                             cl_mem external_mem_image2[4];
                             for (int i = 0; i < num2DImages; i++)
                             {
                                 external_mem_image1[i] =
-                                    nonDedicatedExternalMemory1[i]
+                                    externalMemory1[i]
                                         ->getExternalMemoryImage();
                                 external_mem_image2[i] =
-                                    nonDedicatedExternalMemory2[i]
+                                    externalMemory2[i]
                                         ->getExternalMemoryImage();
                             }
-                            VulkanImage2DList &vkImage2DList =
-                                vkNonDedicatedImage2DList;
-                            VulkanImageViewList &vkImage2DViewList =
-                                vkNonDedicatedImage2DViewList;
 
                             clCl2VkExternalSemaphore->signal(cmd_queue1);
                             if (!useSingleImageKernel)
                             {
-                                for (size_t i2DIdx = 0;
-                                     i2DIdx < vkImage2DList.size(); i2DIdx++)
-                                {
-                                    for (uint32_t mipLevel = 0;
-                                         mipLevel < numMipLevels; mipLevel++)
-                                    {
-                                        uint32_t i2DvIdx =
-                                            (uint32_t)(i2DIdx * numMipLevels)
-                                            + mipLevel;
-                                        vkDescriptorSet.update(
-                                            1 + i2DvIdx,
-                                            vkImage2DViewList[i2DvIdx]);
-                                    }
-                                }
+                                vkDescriptorSet.updateArray(1,
+                                                            vkImage2DViewList);
                                 vkCopyCommandBuffer.begin();
                                 vkCopyCommandBuffer.pipelineBarrier(
                                     vkImage2DList,
@@ -1275,29 +1227,25 @@
                             }
                             for (int i = 0; i < num2DImages; i++)
                             {
-                                delete vkNonDedicatedImage2DListDeviceMemory1
-                                    [i];
-                                delete vkNonDedicatedImage2DListDeviceMemory2
-                                    [i];
-                                delete nonDedicatedExternalMemory1[i];
-                                delete nonDedicatedExternalMemory2[i];
+                                delete vkImage2DListDeviceMemory1[i];
+                                delete vkImage2DListDeviceMemory2[i];
+                                delete externalMemory1[i];
+                                delete externalMemory2[i];
                             }
-                            vkNonDedicatedImage2DListDeviceMemory1.erase(
-                                vkNonDedicatedImage2DListDeviceMemory1.begin(),
-                                vkNonDedicatedImage2DListDeviceMemory1.begin()
+                            vkImage2DListDeviceMemory1.erase(
+                                vkImage2DListDeviceMemory1.begin(),
+                                vkImage2DListDeviceMemory1.begin()
                                     + num2DImages);
-                            vkNonDedicatedImage2DListDeviceMemory2.erase(
-                                vkNonDedicatedImage2DListDeviceMemory2.begin(),
-                                vkNonDedicatedImage2DListDeviceMemory2.begin()
+                            vkImage2DListDeviceMemory2.erase(
+                                vkImage2DListDeviceMemory2.begin(),
+                                vkImage2DListDeviceMemory2.begin()
                                     + num2DImages);
-                            nonDedicatedExternalMemory1.erase(
-                                nonDedicatedExternalMemory1.begin(),
-                                nonDedicatedExternalMemory1.begin()
-                                    + num2DImages);
-                            nonDedicatedExternalMemory2.erase(
-                                nonDedicatedExternalMemory2.begin(),
-                                nonDedicatedExternalMemory2.begin()
-                                    + num2DImages);
+                            externalMemory1.erase(externalMemory1.begin(),
+                                                  externalMemory1.begin()
+                                                      + num2DImages);
+                            externalMemory2.erase(externalMemory2.begin(),
+                                                  externalMemory2.begin()
+                                                      + num2DImages);
                             if (CL_SUCCESS != err)
                             {
                                 goto CLEANUP;
diff --git a/test_conformance/vulkan/vulkan_interop_common.hpp b/test_conformance/vulkan/vulkan_interop_common.hpp
index 18d84f0..a116240 100644
--- a/test_conformance/vulkan/vulkan_interop_common.hpp
+++ b/test_conformance/vulkan/vulkan_interop_common.hpp
@@ -45,6 +45,5 @@
 extern bool disableNTHandleType;
 // Enable offset for multiImport of vulkan device memory
 extern bool enableOffset;
-extern bool non_dedicated;
 
 #endif // _vulkan_interop_common_hpp_
diff --git a/test_conformance/workgroups/test_wg_all.cpp b/test_conformance/workgroups/test_wg_all.cpp
index 41abd12..f9b574e 100644
--- a/test_conformance/workgroups/test_wg_all.cpp
+++ b/test_conformance/workgroups/test_wg_all.cpp
@@ -75,7 +75,6 @@
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_any.cpp b/test_conformance/workgroups/test_wg_any.cpp
index e0242cf..f7ff899 100644
--- a/test_conformance/workgroups/test_wg_any.cpp
+++ b/test_conformance/workgroups/test_wg_any.cpp
@@ -75,7 +75,6 @@
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index e24ac7b..a4cb0c6 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -70,7 +70,7 @@
 
     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
     {
-        int local_size = (n-i) > wg_size ? wg_size : (n-i);
+        size_t local_size = (n - i) > wg_size ? wg_size : (n - i);
         float broadcast_result = inptr[i + (group_id % local_size)];
         for (j=0; j<local_size; j++)
         {
@@ -172,7 +172,6 @@
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -207,7 +206,7 @@
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -278,7 +277,6 @@
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -333,7 +331,7 @@
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -402,7 +400,6 @@
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -458,7 +455,7 @@
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
index 648e68c..a31fca6 100644
--- a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -219,10 +219,8 @@
 int do_test_work_group_suggested_local_size(
     cl_device_id device, cl_context context, cl_command_queue queue,
     bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
-    cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim)
+    cl_ulong max_local_mem_size, size_t global_work_offset[], num_dims dim)
 {
-    clProgramWrapper scan_program;
-    clKernelWrapper scan_kernel;
     int err;
     size_t test_values[] = { 1, 1, 1 };
     std::string kernel_names[6] = {
@@ -244,6 +242,8 @@
     for (int kernel_num = 0; kernel_num < 6; kernel_num++)
     {
         if (max_local_mem_size < local_mem_size[kernel_num]) continue;
+        clProgramWrapper scan_program;
+        clKernelWrapper scan_kernel;
         // Create the kernel
         err = create_single_kernel_helper(
             context, &scan_program, &scan_kernel, 1,
@@ -300,7 +300,7 @@
                  "Skipping the test.\n");
         return TEST_SKIPPED_ITSELF;
     }
-    cl_long max_local_mem_size;
+    cl_ulong max_local_mem_size;
     cl_int err =
         clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);